From 0be31357cf74385acbc47226feba4e89a044951b Mon Sep 17 00:00:00 2001 From: Eliyan Date: Tue, 14 Oct 2025 11:18:47 +0200 Subject: [PATCH] [WIP] remove zod and use actual schema --- README.md | 15 +-- poc.js | 379 +++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 327 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index be97e8f..180f753 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ This is a **Proof of Concept (PoC)** that demonstrates the conversion of natural ## Installation 1. Install dependencies: + ```bash npm install ``` @@ -63,14 +64,8 @@ The PoC generates ODMDB queries in this format: ```json { "object": "seekers", - "condition": [ - "prop.dt_create(>=:2025-10-06)" - ], - "fields": [ - "alias", - "email", - "seekworkingyear" - ] + "condition": ["prop.dt_create(>=:2025-10-06)"], + "fields": ["alias", "email", "seekworkingyear"] } ``` @@ -85,6 +80,7 @@ The PoC understands and generates these ODMDB DSL patterns: ## Field Mappings Currently supports mapping for seekers object: + - `email` → `email` - `experience` → `seekworkingyear` - `job titles` → `seekjobtitleexperience` @@ -93,6 +89,7 @@ Currently supports mapping for seekers object: ## Schema Context The PoC can optionally load schema files for context: + - `main.json` - Combined schema definitions - `lg.json` - Localization/language mappings @@ -116,4 +113,4 @@ The PoC can optionally load schema files for context: - `poc.js` - Main PoC implementation - `package.json` - Dependencies and scripts - `main.json` - Optional schema context (if available) -- `lg.json` - Optional localization context (if available) \ No newline at end of file +- `lg.json` - Optional localization context (if available) diff --git a/poc.js b/poc.js index 9ec829a..f1e514e 100644 --- a/poc.js +++ b/poc.js @@ -1,22 +1,21 @@ -// PoC: NL → ODMDB query (seekers) +// PoC: NL → ODMDB query (seekers), no zod — validate via ODMDB schema // Usage: // 1) export OPENAI_API_KEY=sk-... 
/**
 * Best-effort extraction of the seekers property names from an ODMDB schema
 * document whose exact layout is not known in advance.
 *
 * Shapes are tried in this order; the first one that yields a result wins:
 *   1) `{ objects: { seekers: { properties: {...} } } }`
 *   2) an array whose entries are themselves searched, in order
 *   3) any nested `{ seekers: { properties: {...} } }` node
 *   4) any nested node whose string `$id` contains "/seekers" and that carries
 *      a `properties` object (the layout SchemaMapper.findSchemaByType expects)
 *
 * Shape 4 is only consulted when shapes 1-3 produce nothing, so documents that
 * were already understood keep returning exactly the same keys.
 *
 * @param {unknown} main - Parsed content of main.json (may be null/undefined).
 * @returns {string[]} Property names of the seekers schema, or [] when none
 *   can be located.
 */
function extractSeekersPropsFromOdmdbSchema(main) {
  if (!main) return [];

  const isObject = (value) => value !== null && typeof value === "object";

  // Iterative depth-first walk. `pick(node)` returns the matching properties
  // object (or null). The `seen` set makes the walk terminate on
  // self-referencing structures instead of looping forever.
  const findFirstProps = (root, pick) => {
    const seen = new Set();
    const stack = [root];
    while (stack.length) {
      const node = stack.pop();
      if (!isObject(node) || seen.has(node)) continue;
      seen.add(node);

      const props = pick(node);
      if (props) return Object.keys(props);

      for (const child of Object.values(node)) {
        if (isObject(child)) stack.push(child);
      }
    }
    return [];
  };

  const pickNestedSeekers = (node) =>
    isObject(node.seekers?.properties) ? node.seekers.properties : null;

  const pickSeekersById = (node) =>
    typeof node.$id === "string" &&
    node.$id.includes("/seekers") &&
    isObject(node.properties)
      ? node.properties
      : null;

  // Arrays already expanded by shape 2; guards against an array that contains
  // itself (directly or indirectly).
  const expandedArrays = new Set();

  const lookupNested = (doc) => {
    if (!doc) return [];

    // 1) { objects: { seekers: { properties: {...} } } }
    const direct = doc.objects?.seekers?.properties;
    if (isObject(direct)) return Object.keys(direct);

    // 2) Array: the first entry that yields a non-empty result wins.
    if (Array.isArray(doc) && !expandedArrays.has(doc)) {
      expandedArrays.add(doc);
      for (const entry of doc) {
        const keys = lookupNested(entry);
        if (keys.length) return keys;
      }
    }

    // 3) Deep search for a { seekers: { properties: {...} } } node.
    return findFirstProps(doc, pickNestedSeekers);
  };

  const nested = lookupNested(main);
  if (nested.length) return nested;

  // 4) Fallback: schema entries identified by their `$id`.
  return findFirstProps(main, pickSeekersById);
}
+ +// ---- Schema-based mapping system ---- +class SchemaMapper { + constructor(schemas) { + this.schemas = schemas.main || []; + this.seekersSchema = this.findSchemaByType("seekers"); + this.fieldMappings = this.buildFieldMappings(); + this.indexMappings = this.buildIndexMappings(); + } + + findSchemaByType(objectType) { + if (!this.schemas || !Array.isArray(this.schemas)) return null; + return this.schemas.find( + (schema) => schema.$id && schema.$id.includes(`/${objectType}`) + ); + } + + buildFieldMappings() { + if (!this.seekersSchema) return {}; + + const mappings = {}; + const properties = this.seekersSchema.properties || {}; + + Object.entries(properties).forEach(([fieldName, fieldDef]) => { + const synonyms = this.generateSynonyms(fieldName, fieldDef); + mappings[fieldName] = { + field: fieldName, + title: fieldDef.title?.toLowerCase(), + description: fieldDef.description?.toLowerCase(), + type: fieldDef.type, + synonyms, + }; + + // Index by title and synonyms + if (fieldDef.title) { + mappings[fieldDef.title.toLowerCase()] = fieldName; + } + synonyms.forEach((synonym) => { + mappings[synonym.toLowerCase()] = fieldName; + }); + }); + + return mappings; + } + + buildIndexMappings() { + if (!this.seekersSchema?.apxidx) return {}; + + const indexes = {}; + this.seekersSchema.apxidx.forEach((idx) => { + indexes[idx.name] = { + name: idx.name, + type: idx.type, + keyval: idx.keyval, + }; + }); + + return indexes; + } + + generateSynonyms(fieldName, fieldDef) { + const synonyms = []; + + // Common mappings based on actual schema + const commonMappings = { + email: ["contact", "mail", "contact email"], + seekworkingyear: ["experience", "years of experience", "work experience"], + seekjobtitleexperience: ["job titles", "job experience", "positions"], + seekstatus: ["status", "availability", "looking"], + dt_create: ["created", "creation date", "new", "recent", "since"], + salaryexpectation: ["salary", "pay", "compensation", "wage"], + seeklocation: ["location", 
"where", "place"], + mbti: ["personality", "type", "profile"], + alias: ["id", "identifier", "username"], + }; + + if (commonMappings[fieldName]) { + synonyms.push(...commonMappings[fieldName]); + } + + return synonyms; + } + + mapNLToFields(nlTerms) { + const mappedFields = []; + + nlTerms.forEach((term) => { + const normalizedTerm = term.toLowerCase(); + const mapping = this.fieldMappings[normalizedTerm]; + + if (mapping) { + if (typeof mapping === "string") { + mappedFields.push(mapping); + } else if (mapping.field) { + mappedFields.push(mapping.field); + } + } + }); + + return [...new Set(mappedFields)]; // Remove duplicates + } + + getRecruiterReadableFields() { + if (!this.seekersSchema?.apxaccessrights?.recruiters?.R) { + // Fallback to basic fields + return ["alias", "email", "seekstatus", "seekworkingyear"]; + } + return this.seekersSchema.apxaccessrights.recruiters.R; + } + + getAllSeekersFields() { + if (!this.seekersSchema?.properties) return []; + return Object.keys(this.seekersSchema.properties); + } + + getAvailableIndexes() { + return Object.keys(this.indexMappings); + } + + getIndexByField(fieldName) { + const index = Object.values(this.indexMappings).find( + (idx) => idx.keyval === fieldName + ); + return index ? 
`idx.${index.name}` : null; + } +} + +// Initialize schema mapper +const schemaMapper = new SchemaMapper(SCHEMAS); + +const SEEKERS_FIELDS_FROM_SCHEMA = schemaMapper.getAllSeekersFields(); + +// ---- Minimal mapping config (for prompting + default fields) ---- const seekersMapping = { object: "seekers", - readableFieldsForRecruiters: [ - "alias", - "email", - "seekstatus", - "seekworkingyear", - "seekjobtitleexperience", - ], + defaultReadableFields: schemaMapper.getRecruiterReadableFields().slice(0, 5), // First 5 readable fields }; -// ---- Output contract (strict) ---- -const OdmdbQueryZ = z.object({ - object: z.literal("seekers"), - condition: z.array(z.string()), - fields: z.array(z.string()), // always an array -}); +// ---- JSON Schema for Structured Outputs (no zod, no oneOf) ---- +function buildResponseJsonSchema() { + const recruiterReadableFields = schemaMapper.getRecruiterReadableFields(); -// JSON Schema for Structured Output -const RESPONSE_JSON_SCHEMA = { - type: "object", - additionalProperties: false, - properties: { - object: { type: "string", enum: ["seekers"] }, - condition: { type: "array", items: { type: "string" } }, - fields: { type: "array", items: { type: "string" }, minItems: 1 }, - }, - required: ["object", "condition", "fields"], -}; + return { + type: "object", + additionalProperties: false, + properties: { + object: { type: "string", enum: ["seekers"] }, + condition: { type: "array", items: { type: "string" }, minItems: 1 }, + fields: { + type: "array", + items: { + type: "string", + enum: recruiterReadableFields, + }, + minItems: 1, + }, + }, + required: ["object", "condition", "fields"], + }; +} // ---- Prompt builders ---- function systemPrompt() { + const availableFields = schemaMapper.getAllSeekersFields(); + const recruiterReadableFields = schemaMapper.getRecruiterReadableFields(); + const availableIndexes = schemaMapper.getAvailableIndexes(); + return [ "You convert a natural language request into an ODMDB search payload.", - 
"Return ONLY a compact JSON object that matches the provided JSON Schema. The 'fields' property MUST be an array of strings.", + "Return ONLY a compact JSON object that matches the provided JSON Schema.", "", "ODMDB DSL:", "- join(remoteObject:localKey:remoteProp:operator:value)", - "- idx.(value)", - "- prop.(operator:value) with dates or scalars.", + "- idx.(value) - for indexed fields", + "- prop.(operator:value) - for direct property queries", + "", + "Available seekers fields:", + availableFields.slice(0, 15).join(", ") + + (availableFields.length > 15 ? "..." : ""), + "", + "Available indexes for optimization:", + availableIndexes.join(", "), + "", + "Recruiter-readable fields (use these for field selection):", + recruiterReadableFields.join(", "), + "", + "Field mappings for natural language:", + "- 'email' → email", + "- 'experience' → seekworkingyear", + "- 'job titles' → seekjobtitleexperience", + "- 'status' → seekstatus", + "- 'salary' → salaryexpectation", + "- 'location' → seeklocation", + "- 'new/recent' → dt_create (use prop.dt_create(>=:YYYY-MM-DD))", "", "Rules:", "- Object must be 'seekers'.", - "- For 'new'/'recent' recency, map to prop.dt_create with a resolved absolute date.", - "- For 'experience', map to seekworkingyear.", - "- Prefer recruiter-readable fields if a small set is requested. If the request is generic, return this default shortlist:", - seekersMapping.readableFieldsForRecruiters.join(", "), + "- Use indexes when possible (idx.seekstatus_alias for status queries)", + "- For date filters, use prop.dt_create with absolute dates", + "- Only return recruiter-readable fields in 'fields' array", + `- Default fields if request is generic: ${recruiterReadableFields + .slice(0, 5) + .join(", ")}`, "", - "Timezone is Europe/Paris. Today is 2025-10-13.", - "Interpret 'last week' as now minus 7 days → 2025-10-06.", - "", - "Schemas (context only, may be null):", - JSON.stringify(SCHEMAS, null, 2), + "Timezone is Europe/Paris. 
Today is 2025-10-14.", + "Interpret 'last week' as now minus 7 days → 2025-10-07.", + "Interpret 'yesterday' as → 2025-10-13.", ].join("\n"); } function userPrompt(nl) { @@ -101,11 +295,10 @@ async function inferQuery(nlText) { { role: "user", content: userPrompt(nlText) }, ], text: { - // <= new location for structured output format format: { name: "OdmdbQuery", type: "json_schema", - schema: RESPONSE_JSON_SCHEMA, + schema: buildResponseJsonSchema(), strict: true, }, }, @@ -119,34 +312,104 @@ async function inferQuery(nlText) { })(); const parsed = JSON.parse(jsonText); - const validated = OdmdbQueryZ.parse(parsed); + return parsed; +} - // Light safety check on DSL tokens - const allowed = ["join(", "idx.", "prop."]; - for (const c of validated.condition) { - const ok = allowed.some((t) => c.includes(t)); - const ascii = /^[\x09\x0A\x0D\x20-\x7E()_:\[\].,=> c.includes(t)); + const ascii = /^[\x09\x0A\x0D\x20-\x7E()_:\[\].,=> !SEEKERS_FIELDS_FROM_SCHEMA.includes(f) + ); + if (unknown.length) { + // Drop unknown but continue (PoC behavior) + console.warn( + "⚠️ Dropping unknown fields (not in seekers schema):", + unknown + ); + candidate.fields = candidate.fields.filter((f) => + SEEKERS_FIELDS_FROM_SCHEMA.includes(f) + ); + if (!candidate.fields.length) { + // If all dropped, fallback to default shortlist intersected with schema + const fallback = seekersMapping.defaultReadableFields.filter((f) => + SEEKERS_FIELDS_FROM_SCHEMA.includes(f) + ); + if (!fallback.length) + throw new Error( + "No valid fields remain after validation and no fallback available." 
+ ); + candidate.fields = fallback; + } + } + } else { + // If we can't read the schema (main.json shape unknown), at least ensure strings & dedupe + candidate.fields = [ + ...new Set( + candidate.fields.filter((f) => typeof f === "string" && f.trim()) + ), + ]; + } + + return candidate; } // ---- Run PoC (print only the created query; do not execute) ---- (async () => { try { - if (!process.env.OPENAI_API_KEY) { + if (!process.env.OPENAI_API_KEY) throw new Error("Missing OPENAI_API_KEY env var."); - } const out = await inferQuery(NL_QUERY); + const validated = validateWithOdmdbSchema(out); - // Just output the created query (no execution) + // Output ONLY the created query (no execution) console.log( JSON.stringify( { - object: out.object, - condition: out.condition, - fields: out.fields, + object: validated.object, + condition: validated.condition, + fields: validated.fields, }, null, 2