[WIP] remove zod and use actual schema

Eliyan
2025-10-14 11:18:47 +02:00
parent 6dbfe5cb07
commit 0be31357cf
2 changed files with 327 additions and 67 deletions


@@ -22,6 +22,7 @@ This is a **Proof of Concept (PoC)** that demonstrates the conversion of natural
## Installation
1. Install dependencies:
```bash
npm install
```
@@ -63,14 +64,8 @@ The PoC generates ODMDB queries in this format:
```json
{
"object": "seekers",
"condition": [
"prop.dt_create(>=:2025-10-06)"
],
"fields": [
"alias",
"email",
"seekworkingyear"
]
"condition": ["prop.dt_create(>=:2025-10-06)"],
"fields": ["alias", "email", "seekworkingyear"]
}
```
@@ -85,6 +80,7 @@ The PoC understands and generates these ODMDB DSL patterns:
## Field Mappings
Currently supports mappings for the seekers object:
- `email` → `email`
- `experience` → `seekworkingyear`
- `job titles` → `seekjobtitleexperience`
@@ -93,6 +89,7 @@ Currently supports mapping for seekers object:
## Schema Context
The PoC can optionally load schema files for context:
- `main.json` - Combined schema definitions
- `lg.json` - Localization/language mappings
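
The `main.json` file is also used for field mapping and validation: `SchemaMapper` looks for an array entry whose `$id` contains `/seekers` and reads its `properties`, `apxidx`, and `apxaccessrights.recruiters.R`. A minimal sketch of the shape it expects is shown below; the `$id` URL, the property titles/types, and the index `type` value are illustrative placeholders rather than the real ODMDB definitions.

```json
[
  {
    "$id": "https://example.org/odmdb/seekers",
    "properties": {
      "alias": { "type": "string", "title": "Alias" },
      "email": { "type": "string", "title": "Email" },
      "seekworkingyear": { "type": "integer", "title": "Years of experience" },
      "dt_create": { "type": "string", "title": "Creation date" }
    },
    "apxidx": [
      { "name": "seekstatus_alias", "type": "lookup", "keyval": "seekstatus" }
    ],
    "apxaccessrights": {
      "recruiters": { "R": ["alias", "email", "seekstatus", "seekworkingyear"] }
    }
  }
]
```

If `main.json` is missing or does not match this shape, the PoC falls back to a hardcoded recruiter-readable shortlist (`alias`, `email`, `seekstatus`, `seekworkingyear`).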

poc.js

@@ -1,22 +1,21 @@
// PoC: NL → ODMDB query (seekers)
// PoC: NL → ODMDB query (seekers), no zod — validate via ODMDB schema
// Usage:
// 1) export OPENAI_API_KEY=sk-...
// 2) node poc.js
import fs from "node:fs";
import OpenAI from "openai";
import { z } from "zod";
// ---- Config ----
const MODEL = process.env.OPENAI_MODEL || "gpt-5";
const MAIN_SCHEMA_PATH = "./main.json"; // optional context; safe if missing
const LG_SCHEMA_PATH = "./lg.json"; // optional context; safe if missing
const MAIN_SCHEMA_PATH = "./main.json"; // optional context; used for validation
const LG_SCHEMA_PATH = "./lg.json"; // optional context
// Hardcoded NL query for the PoC (no multi-turn)
const NL_QUERY =
"give me new seekers since last week with email and experience";
// ---- Load schemas if present (not required for output) ----
// ---- Load schemas (safe) ----
function loadJsonSafe(path) {
try {
if (fs.existsSync(path)) {
@@ -30,60 +29,255 @@ const SCHEMAS = {
lg: loadJsonSafe(LG_SCHEMA_PATH),
};
// ---- Seekers mapping (from our agreement) ----
// ---- Helpers to read seekers field names from your ODMDB custom schema ----
function extractSeekersPropsFromOdmdbSchema(main) {
if (!main) return [];
// Try common shapes
// 1) { objects: { seekers: { properties: {...} } } }
if (
main.objects?.seekers?.properties &&
typeof main.objects.seekers.properties === "object"
) {
return Object.keys(main.objects.seekers.properties);
}
// 2) If main is an array, search for an item that looks like seekers schema
if (Array.isArray(main)) {
for (const entry of main) {
const keys = extractSeekersPropsFromOdmdbSchema(entry);
if (keys.length) return keys;
}
}
// 3) Fallback: deep search for a { seekers: { properties: {...} } } node
try {
const stack = [main];
while (stack.length) {
const node = stack.pop();
if (node && typeof node === "object") {
if (
node.seekers?.properties &&
typeof node.seekers.properties === "object"
) {
return Object.keys(node.seekers.properties);
}
for (const v of Object.values(node)) {
if (v && typeof v === "object") stack.push(v);
}
}
}
} catch {}
return [];
}
// ---- Schema-based mapping system ----
class SchemaMapper {
constructor(schemas) {
this.schemas = schemas.main || [];
this.seekersSchema = this.findSchemaByType("seekers");
this.fieldMappings = this.buildFieldMappings();
this.indexMappings = this.buildIndexMappings();
}
findSchemaByType(objectType) {
if (!this.schemas || !Array.isArray(this.schemas)) return null;
return this.schemas.find(
(schema) => schema.$id && schema.$id.includes(`/${objectType}`)
);
}
buildFieldMappings() {
if (!this.seekersSchema) return {};
const mappings = {};
const properties = this.seekersSchema.properties || {};
Object.entries(properties).forEach(([fieldName, fieldDef]) => {
const synonyms = this.generateSynonyms(fieldName, fieldDef);
mappings[fieldName] = {
field: fieldName,
title: fieldDef.title?.toLowerCase(),
description: fieldDef.description?.toLowerCase(),
type: fieldDef.type,
synonyms,
};
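// Note: the canonical field name maps to this descriptor object, while the
// title/synonym aliases added below map to a plain string; mapNLToFields
// accepts both shapes.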
// Index by title and synonyms
if (fieldDef.title) {
mappings[fieldDef.title.toLowerCase()] = fieldName;
}
synonyms.forEach((synonym) => {
mappings[synonym.toLowerCase()] = fieldName;
});
});
return mappings;
}
buildIndexMappings() {
if (!this.seekersSchema?.apxidx) return {};
const indexes = {};
this.seekersSchema.apxidx.forEach((idx) => {
indexes[idx.name] = {
name: idx.name,
type: idx.type,
keyval: idx.keyval,
};
});
return indexes;
}
generateSynonyms(fieldName, fieldDef) {
const synonyms = [];
// Common mappings based on actual schema
const commonMappings = {
email: ["contact", "mail", "contact email"],
seekworkingyear: ["experience", "years of experience", "work experience"],
seekjobtitleexperience: ["job titles", "job experience", "positions"],
seekstatus: ["status", "availability", "looking"],
dt_create: ["created", "creation date", "new", "recent", "since"],
salaryexpectation: ["salary", "pay", "compensation", "wage"],
seeklocation: ["location", "where", "place"],
mbti: ["personality", "type", "profile"],
alias: ["id", "identifier", "username"],
};
if (commonMappings[fieldName]) {
synonyms.push(...commonMappings[fieldName]);
}
return synonyms;
}
mapNLToFields(nlTerms) {
const mappedFields = [];
nlTerms.forEach((term) => {
const normalizedTerm = term.toLowerCase();
const mapping = this.fieldMappings[normalizedTerm];
if (mapping) {
if (typeof mapping === "string") {
mappedFields.push(mapping);
} else if (mapping.field) {
mappedFields.push(mapping.field);
}
}
});
return [...new Set(mappedFields)]; // Remove duplicates
}
getRecruiterReadableFields() {
if (!this.seekersSchema?.apxaccessrights?.recruiters?.R) {
// Fallback to basic fields
return ["alias", "email", "seekstatus", "seekworkingyear"];
}
return this.seekersSchema.apxaccessrights.recruiters.R;
}
getAllSeekersFields() {
if (!this.seekersSchema?.properties) return [];
return Object.keys(this.seekersSchema.properties);
}
getAvailableIndexes() {
return Object.keys(this.indexMappings);
}
getIndexByField(fieldName) {
const index = Object.values(this.indexMappings).find(
(idx) => idx.keyval === fieldName
);
return index ? `idx.${index.name}` : null;
}
}
// Initialize schema mapper
const schemaMapper = new SchemaMapper(SCHEMAS);
const SEEKERS_FIELDS_FROM_SCHEMA = schemaMapper.getAllSeekersFields();
// ---- Minimal mapping config (for prompting + default fields) ----
const seekersMapping = {
object: "seekers",
readableFieldsForRecruiters: [
"alias",
"email",
"seekstatus",
"seekworkingyear",
"seekjobtitleexperience",
],
defaultReadableFields: schemaMapper.getRecruiterReadableFields().slice(0, 5), // First 5 readable fields
};
// ---- Output contract (strict) ----
const OdmdbQueryZ = z.object({
object: z.literal("seekers"),
condition: z.array(z.string()),
fields: z.array(z.string()), // always an array
});
// ---- JSON Schema for Structured Outputs (no zod, no oneOf) ----
function buildResponseJsonSchema() {
const recruiterReadableFields = schemaMapper.getRecruiterReadableFields();
// JSON Schema for Structured Output
const RESPONSE_JSON_SCHEMA = {
return {
type: "object",
additionalProperties: false,
properties: {
object: { type: "string", enum: ["seekers"] },
condition: { type: "array", items: { type: "string" } },
fields: { type: "array", items: { type: "string" }, minItems: 1 },
condition: { type: "array", items: { type: "string" }, minItems: 1 },
fields: {
type: "array",
items: {
type: "string",
enum: recruiterReadableFields,
},
minItems: 1,
},
},
required: ["object", "condition", "fields"],
};
};
}
// ---- Prompt builders ----
function systemPrompt() {
const availableFields = schemaMapper.getAllSeekersFields();
const recruiterReadableFields = schemaMapper.getRecruiterReadableFields();
const availableIndexes = schemaMapper.getAvailableIndexes();
return [
"You convert a natural language request into an ODMDB search payload.",
"Return ONLY a compact JSON object that matches the provided JSON Schema. The 'fields' property MUST be an array of strings.",
"Return ONLY a compact JSON object that matches the provided JSON Schema.",
"",
"ODMDB DSL:",
"- join(remoteObject:localKey:remoteProp:operator:value)",
"- idx.<indexName>(value)",
"- prop.<field>(operator:value) with dates or scalars.",
"- idx.<indexName>(value) - for indexed fields",
"- prop.<field>(operator:value) - for direct property queries",
"",
"Available seekers fields:",
availableFields.slice(0, 15).join(", ") +
(availableFields.length > 15 ? "..." : ""),
"",
"Available indexes for optimization:",
availableIndexes.join(", "),
"",
"Recruiter-readable fields (use these for field selection):",
recruiterReadableFields.join(", "),
"",
"Field mappings for natural language:",
"- 'email' → email",
"- 'experience' → seekworkingyear",
"- 'job titles' → seekjobtitleexperience",
"- 'status' → seekstatus",
"- 'salary' → salaryexpectation",
"- 'location' → seeklocation",
"- 'new/recent' → dt_create (use prop.dt_create(>=:YYYY-MM-DD))",
"",
"Rules:",
"- Object must be 'seekers'.",
"- For 'new'/'recent' recency, map to prop.dt_create with a resolved absolute date.",
"- For 'experience', map to seekworkingyear.",
"- Prefer recruiter-readable fields if a small set is requested. If the request is generic, return this default shortlist:",
seekersMapping.readableFieldsForRecruiters.join(", "),
"- Use indexes when possible (idx.seekstatus_alias for status queries)",
"- For date filters, use prop.dt_create with absolute dates",
"- Only return recruiter-readable fields in 'fields' array",
`- Default fields if request is generic: ${recruiterReadableFields
.slice(0, 5)
.join(", ")}`,
"",
"Timezone is Europe/Paris. Today is 2025-10-13.",
"Interpret 'last week' as now minus 7 days → 2025-10-06.",
"",
"Schemas (context only, may be null):",
JSON.stringify(SCHEMAS, null, 2),
"Timezone is Europe/Paris. Today is 2025-10-14.",
"Interpret 'last week' as now minus 7 days → 2025-10-07.",
"Interpret 'yesterday' as → 2025-10-13.",
].join("\n");
}
function userPrompt(nl) {
@@ -101,11 +295,10 @@ async function inferQuery(nlText) {
{ role: "user", content: userPrompt(nlText) },
],
text: {
// <= new location for structured output format
format: {
name: "OdmdbQuery",
type: "json_schema",
schema: RESPONSE_JSON_SCHEMA,
schema: buildResponseJsonSchema(),
strict: true,
},
},
@@ -119,34 +312,104 @@ async function inferQuery(nlText) {
})();
const parsed = JSON.parse(jsonText);
const validated = OdmdbQueryZ.parse(parsed);
return parsed;
}
// Light safety check on DSL tokens
const allowed = ["join(", "idx.", "prop."];
for (const c of validated.condition) {
const ok = allowed.some((t) => c.includes(t));
const ascii = /^[\x09\x0A\x0D\x20-\x7E()_:\[\].,=><!'"-]+$/.test(c);
if (!ok || !ascii) throw new Error(`Malformed condition: ${c}`);
// ---- Validate using the ODMDB schema (not zod) ----
function validateWithOdmdbSchema(candidate) {
// Basic shape checks (already enforced by Structured Outputs, but kept as a defensive guard)
if (!candidate || typeof candidate !== "object")
throw new Error("Invalid response (not an object).");
if (candidate.object !== "seekers")
throw new Error("Invalid object; must be 'seekers'.");
if (!Array.isArray(candidate.condition) || candidate.condition.length === 0) {
throw new Error(
"Invalid 'condition'; must be a non-empty array of strings."
);
}
return validated;
if (!Array.isArray(candidate.fields) || candidate.fields.length === 0) {
throw new Error("Invalid 'fields'; must be a non-empty array of strings.");
}
// Validate fields against the seekers schema (skipped when the schema could not
// be loaded; the no-schema fallback further below handles that case)
const availableFields = schemaMapper.getAllSeekersFields();
const recruiterReadableFields = schemaMapper.getRecruiterReadableFields();
if (availableFields.length) {
for (const field of candidate.fields) {
if (!availableFields.includes(field)) {
throw new Error(`Invalid field '${field}'; not found in seekers schema.`);
}
if (!recruiterReadableFields.includes(field)) {
console.warn(
`Warning: Field '${field}' may not be readable by recruiters.`
);
}
}
}
// DSL token sanity
const allowedTokens = ["join(", "idx.", "prop."];
for (const c of candidate.condition) {
if (typeof c !== "string")
throw new Error("Condition entries must be strings.");
const tokenOK = allowedTokens.some((t) => c.includes(t));
const ascii = /^[\x09\x0A\x0D\x20-\x7E()_:\[\].,=><!'"-]+$/.test(c);
if (!tokenOK || !ascii) throw new Error(`Malformed condition: ${c}`);
}
// Field existence check against ODMDB custom schema (seekers properties)
if (SEEKERS_FIELDS_FROM_SCHEMA.length) {
const unknown = candidate.fields.filter(
(f) => !SEEKERS_FIELDS_FROM_SCHEMA.includes(f)
);
if (unknown.length) {
// Drop unknown but continue (PoC behavior)
console.warn(
"⚠️ Dropping unknown fields (not in seekers schema):",
unknown
);
candidate.fields = candidate.fields.filter((f) =>
SEEKERS_FIELDS_FROM_SCHEMA.includes(f)
);
if (!candidate.fields.length) {
// If all dropped, fallback to default shortlist intersected with schema
const fallback = seekersMapping.defaultReadableFields.filter((f) =>
SEEKERS_FIELDS_FROM_SCHEMA.includes(f)
);
if (!fallback.length)
throw new Error(
"No valid fields remain after validation and no fallback available."
);
candidate.fields = fallback;
}
}
} else {
// If we can't read the schema (main.json shape unknown), at least ensure strings & dedupe
candidate.fields = [
...new Set(
candidate.fields.filter((f) => typeof f === "string" && f.trim())
),
];
}
return candidate;
}
// ---- Run PoC (print only the created query; do not execute) ----
(async () => {
try {
if (!process.env.OPENAI_API_KEY) {
if (!process.env.OPENAI_API_KEY)
throw new Error("Missing OPENAI_API_KEY env var.");
}
const out = await inferQuery(NL_QUERY);
const validated = validateWithOdmdbSchema(out);
// Just output the created query (no execution)
// Output ONLY the created query (no execution)
console.log(
JSON.stringify(
{
object: out.object,
condition: out.condition,
fields: out.fields,
object: validated.object,
condition: validated.condition,
fields: validated.fields,
},
null,
2