Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Improve syntax detection heuristics
  • Loading branch information
adamziel committed Nov 28, 2025
commit 275199a69755a4713ba3316d34faa5391b054f8c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ import {
type ViewUpdate,
} from '@codemirror/view';
import { useCallback, useEffect, useRef, useState } from 'react';
import { detectLanguage, type SupportedLanguage } from './language-detection';
import {
inferLanguageFromBlueprint,
type SupportedLanguage,
} from './infer-language-from-blueprint';
import {
filterSchemaByDiscriminator,
getCurrentContainerType,
Expand Down Expand Up @@ -1128,7 +1131,7 @@ export function JSONSchemaEditor({
useState<StringEditorState>({
isOpen: false,
initialValue: '',
language: 'text',
language: 'plaintext',
contentStart: 0,
contentEnd: 0,
});
Expand All @@ -1146,7 +1149,7 @@ export function JSONSchemaEditor({
const parsedValue = tryParseJsonString(stringInfo.rawValue);
if (parsedValue === null) return false;

const language = detectLanguage(
const language = inferLanguageFromBlueprint(
stringInfo.path,
stringInfo.stepType,
parsedValue
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
/**
 * Languages that the inference helpers in this file can report.
 *
 * `'plaintext'` is the fallback value: `inferLanguageFromContent` returns it
 * for empty input and for input without a strong enough signal.
 */
export type SupportedLanguage =
| 'html'
| 'javascript'
| 'css'
| 'php'
| 'sql'
| 'markdown'
| 'plaintext';

/**
 * Associates a JSON-path pattern with the language to report when it matches.
 */
interface LanguageRule {
/** Regular expression tested against the rendered path string, e.g. `.steps[0].code`. */
pathPattern: RegExp;
/** Language reported when `pathPattern` matches. */
language: SupportedLanguage;
}

/**
 * Path-based language rules.
 *
 * `inferLanguageFromBlueprint` checks these in order, before its step-type
 * and content heuristics, and returns the language of the first rule whose
 * pattern matches. Patterns are tested against the rendered path string
 * (e.g. `.steps[0].code`) and are anchored only at the end, so anything may
 * precede `.steps[N]`.
 */
const languageRules: LanguageRule[] = [
// A `code` field directly on a step is treated as PHP (e.g. the runPHP step).
{ pathPattern: /\.steps\[\d+\]\.code$/, language: 'php' },
// A `sql` field directly on a step is treated as SQL (e.g. the runSQL step).
{ pathPattern: /\.steps\[\d+\]\.sql$/, language: 'sql' },
];

/**
 * Infer the language of a string value located at a given JSON path.
 *
 * Three strategies are tried in order, and the first one that yields a
 * language wins:
 *   1. the path-based `languageRules`,
 *   2. the step type combined with the last path segment,
 *   3. content heuristics via `inferLanguageFromContent`.
 *
 * @param jsonPath - Path segments leading to the string (e.g., ['steps', '0', 'code'])
 * @param stepType - Optional step type discriminator value (e.g., 'runPHP')
 * @param content - Optional string content used by the content heuristics
 * @returns The inferred language, or 'plaintext' when nothing specific is detected
 */
export function inferLanguageFromBlueprint(
	jsonPath: string[],
	stepType?: string,
	content?: string
): SupportedLanguage {
	// Render the segments as a path such as `.steps[0].code`: all-digit
	// segments become bracketed indices glued onto the preceding key.
	const renderedSegments = jsonPath.map((part) =>
		/^\d+$/.test(part) ? `[${part}]` : part
	);
	const dotted = renderedSegments.join('.').replace(/\.\[/g, '[');
	const pathString = `.${dotted}`;

	// 1. Path rules: the first matching rule decides.
	const matchedRule = languageRules.find((rule) =>
		rule.pathPattern.test(pathString)
	);
	if (matchedRule) {
		return matchedRule.language;
	}

	// 2. Step type plus the name of the field being edited.
	const fieldName = jsonPath[jsonPath.length - 1];
	const isPhpStep = stepType === 'runPHP' || stepType === 'runPHPWithOptions';
	if (isPhpStep && fieldName === 'code') {
		return 'php';
	}
	if (stepType === 'runSQL' && fieldName === 'sql') {
		return 'sql';
	}

	// 3. Fall back to inspecting the content itself.
	const snippet = content || '';
	return inferLanguageFromContent(snippet);
}

/**
 * Guess the language of a snippet by scoring it against regex heuristics.
 *
 * Every heuristic that matches adds a fixed weight to one language's score.
 * The highest-scoring language wins; on a tie the language listed first in
 * `scores` (html, javascript, css, php, sql, markdown) is kept. A best score
 * below 2 is treated as "not confident" and yields `'plaintext'`.
 *
 * @param input - Raw string content; leading/trailing whitespace is ignored.
 * @returns The best-scoring language, or `'plaintext'` for empty or
 *          whitespace-only input and for input without a strong enough signal.
 */
export function inferLanguageFromContent(input: string): SupportedLanguage {
	const text = input.trim();
	if (!text) return 'plaintext';

	const lower = text.toLowerCase();

	const scores: Record<Exclude<SupportedLanguage, 'plaintext'>, number> = {
		html: 0,
		javascript: 0,
		css: 0,
		php: 0,
		sql: 0,
		markdown: 0,
	};

	const bump = (lang: keyof typeof scores, value = 1) => {
		scores[lang] += value;
	};

	// --- Strong, almost definitive signals ---

	// PHP
	if (text.includes('<?php') || text.includes('<?= ')) {
		bump('php', 10);
	}
	if (/\$\w+/.test(text)) bump('php', 2);
	if (/\bnamespace\b|\buse\s+[\w\\]+;/.test(text)) bump('php', 2);
	if (/\becho\b|\bvar_dump\s*\(/.test(lower)) bump('php', 1);

	// HTML
	if (/<!doctype\s+html>/i.test(text)) bump('html', 10);
	if (/^<html[\s>]/i.test(text)) bump('html', 8);
	if (/<head[\s>]/i.test(text) || /<body[\s>]/i.test(text)) bump('html', 6);
	if (/<[a-z][^>]*>/.test(text)) bump('html', 3);
	if (/<\/[a-z][^>]*>/.test(text)) bump('html', 3);
	// No leading `\b`: `<` is not a word character, so a word boundary there
	// would require a letter/digit right before the comment. `[\s\S]` lets the
	// comment span several lines.
	if (/<!--[\s\S]*?-->/.test(text)) bump('html', 1);

	// CSS
	// `@` and `:` are not word characters, so a leading `\b` would only match
	// when glued to a preceding word (e.g. `foo@media`). Instead, require the
	// at-rule to start the string or follow a non-word character.
	if (/(?:^|\W)@(?:media|keyframes)\b|:root\b/.test(lower)) bump('css', 4);
	if (/\.[a-z0-9_-]+\s*\{/.test(lower)) bump('css', 3);
	if (/#[a-z0-9_-]+\s*\{/.test(lower)) bump('css', 3);
	if (/[a-z-]+\s*:\s*[^;{}]+;/.test(lower)) bump('css', 2);
	if (/\bdisplay\s*:\s*(flex|grid|block|inline)/.test(lower)) bump('css', 2);

	// JavaScript
	if (/\b(import|export)\s+[^;]+from\b/.test(text)) bump('javascript', 5);
	if (/\b(async\s+)?function\b/.test(text)) bump('javascript', 3);
	if (/\bconst\b|\blet\b|\bvar\b/.test(text)) bump('javascript', 2);
	if (/=>/.test(text)) bump('javascript', 2);
	if (/\bclass\s+\w+/.test(text)) bump('javascript', 1);
	if (/\bconsole\.\w+\s*\(/.test(text)) bump('javascript', 2);
	if (/\bdocument\.\w+|\bwindow\.\w+/.test(text)) bump('javascript', 2);
	if (/;\s*$/.test(text)) bump('javascript', 1);

	// SQL
	if (/\bselect\b[\s\S]+\bfrom\b/i.test(text)) bump('sql', 6);
	if (/\binsert\s+into\b/i.test(text)) bump('sql', 4);
	if (/\bupdate\b[\s\S]+\bset\b/i.test(text)) bump('sql', 4);
	if (/\bdelete\s+from\b/i.test(text)) bump('sql', 4);
	if (/\bcreate\s+table\b/i.test(text)) bump('sql', 4);
	if (/\binner\s+join\b|\bleft\s+join\b|\bright\s+join\b/i.test(text))
		bump('sql', 3);
	if (/\bwhere\b|\border\s+by\b|\bgroup\s+by\b/i.test(text)) bump('sql', 1);

	// Markdown: line-oriented constructs are counted per line, so documents
	// with many headings/list items/quotes score higher.
	const lines = text.split(/\r?\n/);

	let mdHeadingCount = 0;
	let mdListCount = 0;
	let mdCodeFenceCount = 0;
	let mdLinkCount = 0;
	let mdQuoteCount = 0;

	for (const line of lines) {
		const trimmed = line.trim();

		if (/^#{1,6}\s+\S/.test(trimmed)) mdHeadingCount++;
		if (/^(\*|-|\+)\s+\S/.test(trimmed)) mdListCount++;
		if (/^```/.test(trimmed)) mdCodeFenceCount++;
		if (/^>\s+\S/.test(trimmed)) mdQuoteCount++;
	}

	// Links are only checked for presence, not counted.
	if (/\[[^\]]+\]\([^)]+\)/.test(text)) mdLinkCount++;

	if (mdHeadingCount > 0) bump('markdown', 2 + mdHeadingCount);
	if (mdListCount > 0) bump('markdown', 1 + mdListCount);
	if (mdCodeFenceCount > 0) bump('markdown', 3 + mdCodeFenceCount);
	if (mdLinkCount > 0) bump('markdown', 3);
	if (mdQuoteCount > 0) bump('markdown', 1 + mdQuoteCount);

	// Markdown vs HTML conflict: if there are many angle brackets, tilt to HTML
	const angleBrackets =
		(text.match(/</g)?.length ?? 0) + (text.match(/>/g)?.length ?? 0);
	if (angleBrackets > 5) bump('html', 3);

	// CSS vs JS conflict: a `property: value;` declaration together with a
	// braced block tilts the result to CSS.
	if (/[a-z-]+\s*:\s*[^;{}]+;/.test(lower) && /\{[\s\S]*\}/.test(text)) {
		bump('css', 2);
	}

	// JS vs PHP: if PHP tags present, PHP wins anyway (already given big weight)

	// Final decision: strict `>` keeps the earlier entry on ties.
	let bestLang: SupportedLanguage = 'plaintext';
	let bestScore = 0;

	for (const [lang, score] of Object.entries(scores)) {
		if (score > bestScore) {
			bestScore = score;
			bestLang = lang as SupportedLanguage;
		}
	}

	// Require minimal confidence
	if (bestScore < 2) return 'plaintext';

	return bestLang;
}

/**
 * Look up the human-readable name of a language.
 *
 * @param language - The language identifier to describe.
 * @returns The label for that language, e.g. `'Plain Text'` for `'plaintext'`.
 */
export function getLanguageLabel(language: SupportedLanguage): string {
	// Typed as a full Record so that a language without a label fails to compile.
	const labels: Record<SupportedLanguage, string> = {
		php: 'PHP',
		sql: 'SQL',
		html: 'HTML',
		markdown: 'Markdown',
		javascript: 'JavaScript',
		css: 'CSS',
		plaintext: 'Plain Text',
	};
	return labels[language];
}

/**
 * Get all available languages for the dropdown.
 *
 * The list covers every member of `SupportedLanguage`, including
 * `'javascript'` and `'css'`: content-based inference can return those two
 * values, so they need a matching entry to be shown as the current selection.
 * `'plaintext'` is kept last as the generic fallback choice.
 *
 * @returns A new array of `{ value, label }` options on every call.
 */
export function getAvailableLanguages(): {
	value: SupportedLanguage;
	label: string;
}[] {
	return [
		{ value: 'php', label: 'PHP' },
		{ value: 'sql', label: 'SQL' },
		{ value: 'html', label: 'HTML' },
		{ value: 'css', label: 'CSS' },
		{ value: 'javascript', label: 'JavaScript' },
		{ value: 'markdown', label: 'Markdown' },
		{ value: 'plaintext', label: 'Plain Text' },
	];
}
Loading