Skip to content

Commit

Permalink
Better unicode identifier start / continue check (#15455)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jarred-Sumner authored Dec 26, 2024
1 parent d4c0432 commit 145a7fd
Show file tree
Hide file tree
Showing 8 changed files with 322 additions and 2,399 deletions.
172 changes: 0 additions & 172 deletions misctools/gen-unicode-table.js

This file was deleted.

108 changes: 108 additions & 0 deletions misctools/gen-unicode-table.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import { Generator, Context } from "./unicode-generator";

// Create sets for fast lookups
const idStartES5Set = new Set([
...require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
]);

const idContinueES5Set = new Set([
...idStartES5Set,
...require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
]);

const idStartESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Start/code-points"));
const idContinueESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Continue/code-points"));

// Exclude known problematic codepoints
const ID_Continue_mistake = new Set([0x30fb, 0xff65]);

function bitsToU64Array(bits: number[]): bigint[] {
const result: bigint[] = [];
for (let i = 0; i < bits.length; i += 64) {
let value = 0n;
for (let j = 0; j < 64 && i + j < bits.length; j++) {
if (bits[i + j]) {
value |= 1n << BigInt(j);
}
}
result.push(value);
}
return result;
}

async function generateTable(table: string, name: string, checkFn: (cp: number) => boolean) {
const context: Context<boolean> = {
get: (cp: number) => checkFn(cp),
eql: (a: boolean, b: boolean) => a === b,
};

const generator = new Generator(context);
const tables = await generator.generate();

return `
pub fn ${name}(cp: u21) bool {
if (cp > 0x10FFFF) return false;
const high = cp >> 8;
const low = cp & 0xFF;
const stage2_idx = ${table}.stage1[high];
const bit_pos = stage2_idx + low;
const u64_idx = bit_pos >> 6;
const bit_idx = @as(u6, @intCast(bit_pos & 63));
return (${table}.stage2[u64_idx] & (@as(u64, 1) << bit_idx)) != 0;
}
const ${table} = struct {
pub const stage1 = [_]u16{${tables.stage1.join(",")}};
pub const stage2 = [_]u64{${bitsToU64Array(tables.stage2)
.map(n => n.toString())
.join(",")}};
};
`;
}

async function main() {
const functions = [
{
name: "isIDStartES5",
table: "idStartES5",
check: (cp: number) => idStartES5Set.has(cp),
},
{
name: "isIDContinueES5",
table: "idContinueES5",
check: (cp: number) => idContinueES5Set.has(cp),
},
{
name: "isIDStartESNext",
table: "idStartESNext",
check: (cp: number) => idStartESNextSet.has(cp),
},
{
name: "isIDContinueESNext",
table: "idContinueESNext",
check: (cp: number) => idContinueESNextSet.has(cp) && !ID_Continue_mistake.has(cp),
},
];

const results = await Promise.all(
functions.map(async ({ name, check, table }) => {
const code = await generateTable(table, name, check);
return `
/// ${name} checks if a codepoint is valid in the ${name} category
${code}`;
}),
);

console.log(`/// This file is auto-generated. Do not edit.
${results.join("\n\n")}`);
}

main();
5 changes: 4 additions & 1 deletion misctools/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
"license": "MIT",
"devDependencies": {
"@unicode/unicode-13.0.0": "^1.2.1",
"@unicode/unicode-3.0.0": "^1.2.1",
"@unicode/unicode-3.0.0": "^1.6.5",
"semver": "^7.3.7"
},
"dependencies": {
"@unicode/unicode-15.1.0": "^1.6.5"
}
}
Loading

0 comments on commit 145a7fd

Please sign in to comment.