You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

127 lines
3.9 KiB
JavaScript

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

// @flow
/*
* This file defines the Unicode scripts and script families that we
* support. To add new scripts or families, just add a new entry to the
* scriptData array below. Adding scripts to the scriptData array allows
* characters from that script to appear in \text{} environments.
*/
/**
* Each script or script family has a name and an array of blocks.
* Each block is an array of two numbers which specify the start and
* end points (inclusive) of a block of Unicode codepoints.
*/
type Script = {
name: string;
blocks: Array<Array<number>>;
};
/**
* Unicode block data for the families of scripts we support in \text{}.
* Scripts only need to appear here if they do not have font metrics.
*/
const scriptData: Array<Script> = [
{
// Latin characters beyond the Latin-1 characters we have metrics for.
// Needed for Czech, Hungarian and Turkish text, for example.
name: 'latin',
blocks: [
[0x0100, 0x024f], // Latin Extended-A and Latin Extended-B
[0x0300, 0x036f], // Combining Diacritical marks
],
},
{
// The Cyrillic script used by Russian and related languages.
// A Cyrillic subset used to be supported as explicitly defined
// symbols in symbols.js
name: 'cyrillic',
blocks: [[0x0400, 0x04ff]],
},
{
// Armenian
name: 'armenian',
blocks: [[0x0530, 0x058F]],
},
{
// The Brahmic scripts of South and Southeast Asia
// Devanagari (0900097F)
// Bengali (098009FF)
// Gurmukhi (0A000A7F)
// Gujarati (0A800AFF)
// Oriya (0B000B7F)
// Tamil (0B800BFF)
// Telugu (0C000C7F)
// Kannada (0C800CFF)
// Malayalam (0D000D7F)
// Sinhala (0D800DFF)
// Thai (0E000E7F)
// Lao (0E800EFF)
// Tibetan (0F000FFF)
// Myanmar (1000109F)
name: 'brahmic',
blocks: [[0x0900, 0x109F]],
},
{
name: 'georgian',
blocks: [[0x10A0, 0x10ff]],
},
{
// Chinese and Japanese.
// The "k" in cjk is for Korean, but we've separated Korean out
name: "cjk",
blocks: [
[0x3000, 0x30FF], // CJK symbols and punctuation, Hiragana, Katakana
[0x4E00, 0x9FAF], // CJK ideograms
[0xFF00, 0xFF60], // Fullwidth punctuation
// TODO: add halfwidth Katakana and Romanji glyphs
],
},
{
// Korean
name: 'hangul',
blocks: [[0xAC00, 0xD7AF]],
},
];
/**
* Given a codepoint, return the name of the script or script family
* it is from, or null if it is not part of a known block
*/
export function scriptFromCodepoint(codepoint: number): ?string {
for (let i = 0; i < scriptData.length; i++) {
const script = scriptData[i];
for (let i = 0; i < script.blocks.length; i++) {
const block = script.blocks[i];
if (codepoint >= block[0] && codepoint <= block[1]) {
return script.name;
}
}
}
return null;
}
/**
* A flattened version of all the supported blocks in a single array.
* This is an optimization to make supportedCodepoint() fast.
*/
const allBlocks: Array<number> = [];
scriptData.forEach(s => s.blocks.forEach(b => allBlocks.push(...b)));
/**
* Given a codepoint, return true if it falls within one of the
* scripts or script families defined above and false otherwise.
*
* Micro benchmarks shows that this is faster than
* /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60\uAC00-\uD7AF\u0900-\u109F]/.test()
* in Firefox, Chrome and Node.
*/
export function supportedCodepoint(codepoint: number): boolean {
for (let i = 0; i < allBlocks.length; i += 2) {
if (codepoint >= allBlocks[i] && codepoint <= allBlocks[i + 1]) {
return true;
}
}
return false;
}