|
|
// @flow
|
|
|
|
|
|
/*
|
|
|
* This file defines the Unicode scripts and script families that we
|
|
|
* support. To add new scripts or families, just add a new entry to the
|
|
|
* scriptData array below. Adding scripts to the scriptData array allows
|
|
|
* characters from that script to appear in \text{} environments.
|
|
|
*/
|
|
|
|
|
|
/**
|
|
|
* Each script or script family has a name and an array of blocks.
|
|
|
* Each block is an array of two numbers which specify the start and
|
|
|
* end points (inclusive) of a block of Unicode codepoints.
|
|
|
*/
|
|
|
type Script = {
|
|
|
name: string;
|
|
|
blocks: Array<Array<number>>;
|
|
|
};
|
|
|
|
|
|
/**
|
|
|
* Unicode block data for the families of scripts we support in \text{}.
|
|
|
* Scripts only need to appear here if they do not have font metrics.
|
|
|
*/
|
|
|
const scriptData: Array<Script> = [
|
|
|
{
|
|
|
// Latin characters beyond the Latin-1 characters we have metrics for.
|
|
|
// Needed for Czech, Hungarian and Turkish text, for example.
|
|
|
name: 'latin',
|
|
|
blocks: [
|
|
|
[0x0100, 0x024f], // Latin Extended-A and Latin Extended-B
|
|
|
[0x0300, 0x036f], // Combining Diacritical marks
|
|
|
],
|
|
|
},
|
|
|
{
|
|
|
// The Cyrillic script used by Russian and related languages.
|
|
|
// A Cyrillic subset used to be supported as explicitly defined
|
|
|
// symbols in symbols.js
|
|
|
name: 'cyrillic',
|
|
|
blocks: [[0x0400, 0x04ff]],
|
|
|
},
|
|
|
{
|
|
|
// Armenian
|
|
|
name: 'armenian',
|
|
|
blocks: [[0x0530, 0x058F]],
|
|
|
},
|
|
|
{
|
|
|
// The Brahmic scripts of South and Southeast Asia
|
|
|
// Devanagari (0900–097F)
|
|
|
// Bengali (0980–09FF)
|
|
|
// Gurmukhi (0A00–0A7F)
|
|
|
// Gujarati (0A80–0AFF)
|
|
|
// Oriya (0B00–0B7F)
|
|
|
// Tamil (0B80–0BFF)
|
|
|
// Telugu (0C00–0C7F)
|
|
|
// Kannada (0C80–0CFF)
|
|
|
// Malayalam (0D00–0D7F)
|
|
|
// Sinhala (0D80–0DFF)
|
|
|
// Thai (0E00–0E7F)
|
|
|
// Lao (0E80–0EFF)
|
|
|
// Tibetan (0F00–0FFF)
|
|
|
// Myanmar (1000–109F)
|
|
|
name: 'brahmic',
|
|
|
blocks: [[0x0900, 0x109F]],
|
|
|
},
|
|
|
{
|
|
|
name: 'georgian',
|
|
|
blocks: [[0x10A0, 0x10ff]],
|
|
|
},
|
|
|
{
|
|
|
// Chinese and Japanese.
|
|
|
// The "k" in cjk is for Korean, but we've separated Korean out
|
|
|
name: "cjk",
|
|
|
blocks: [
|
|
|
[0x3000, 0x30FF], // CJK symbols and punctuation, Hiragana, Katakana
|
|
|
[0x4E00, 0x9FAF], // CJK ideograms
|
|
|
[0xFF00, 0xFF60], // Fullwidth punctuation
|
|
|
// TODO: add halfwidth Katakana and Romanji glyphs
|
|
|
],
|
|
|
},
|
|
|
{
|
|
|
// Korean
|
|
|
name: 'hangul',
|
|
|
blocks: [[0xAC00, 0xD7AF]],
|
|
|
},
|
|
|
];
|
|
|
|
|
|
/**
|
|
|
* Given a codepoint, return the name of the script or script family
|
|
|
* it is from, or null if it is not part of a known block
|
|
|
*/
|
|
|
export function scriptFromCodepoint(codepoint: number): ?string {
|
|
|
for (let i = 0; i < scriptData.length; i++) {
|
|
|
const script = scriptData[i];
|
|
|
for (let i = 0; i < script.blocks.length; i++) {
|
|
|
const block = script.blocks[i];
|
|
|
if (codepoint >= block[0] && codepoint <= block[1]) {
|
|
|
return script.name;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
return null;
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
* A flattened version of all the supported blocks in a single array.
|
|
|
* This is an optimization to make supportedCodepoint() fast.
|
|
|
*/
|
|
|
const allBlocks: Array<number> = [];
|
|
|
scriptData.forEach(s => s.blocks.forEach(b => allBlocks.push(...b)));
|
|
|
|
|
|
/**
|
|
|
* Given a codepoint, return true if it falls within one of the
|
|
|
* scripts or script families defined above and false otherwise.
|
|
|
*
|
|
|
* Micro benchmarks shows that this is faster than
|
|
|
* /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60\uAC00-\uD7AF\u0900-\u109F]/.test()
|
|
|
* in Firefox, Chrome and Node.
|
|
|
*/
|
|
|
export function supportedCodepoint(codepoint: number): boolean {
|
|
|
for (let i = 0; i < allBlocks.length; i += 2) {
|
|
|
if (codepoint >= allBlocks[i] && codepoint <= allBlocks[i + 1]) {
|
|
|
return true;
|
|
|
}
|
|
|
}
|
|
|
return false;
|
|
|
}
|