You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1029 lines
37 KiB
JavaScript
1029 lines
37 KiB
JavaScript
// @flow
|
|
/* eslint no-constant-condition:0 */
|
|
import functions from "./functions";
|
|
import MacroExpander, {implicitCommands} from "./MacroExpander";
|
|
import symbols, {ATOMS, extraLatin} from "./symbols";
|
|
import {validUnit} from "./units";
|
|
import {supportedCodepoint} from "./unicodeScripts";
|
|
import ParseError from "./ParseError";
|
|
import {combiningDiacriticalMarksEndRegex} from "./Lexer";
|
|
import Settings from "./Settings";
|
|
import SourceLocation from "./SourceLocation";
|
|
import {uSubsAndSups, unicodeSubRegEx} from "./unicodeSupOrSub";
|
|
import {Token} from "./Token";
|
|
|
|
// Pre-evaluate both modules as unicodeSymbols require String.normalize()
|
|
import unicodeAccents from /*preval*/ "./unicodeAccents";
|
|
import unicodeSymbols from /*preval*/ "./unicodeSymbols";
|
|
|
|
import type {ParseNode, AnyParseNode, SymbolParseNode, UnsupportedCmdParseNode}
|
|
from "./parseNode";
|
|
import type {Atom, Group} from "./symbols";
|
|
import type {Mode, ArgType, BreakToken} from "./types";
|
|
import type {FunctionContext, FunctionSpec} from "./defineFunction";
|
|
import type {EnvSpec} from "./defineEnvironment";
|
|
|
|
/**
|
|
* This file contains the parser used to parse out a TeX expression from the
|
|
* input. Since TeX isn't context-free, standard parsers don't work particularly
|
|
* well.
|
|
*
|
|
* The strategy of this parser is as such:
|
|
*
|
|
* The main functions (the `.parse...` ones) take a position in the current
|
|
* parse string to parse tokens from. The lexer (found in Lexer.js, stored at
|
|
* this.gullet.lexer) also supports pulling out tokens at arbitrary places. When
|
|
* individual tokens are needed at a position, the lexer is called to pull out a
|
|
* token, which is then used.
|
|
*
|
|
* The parser has a property called "mode" indicating the mode that
|
|
* the parser is currently in. Currently it has to be one of "math" or
|
|
* "text", which denotes whether the current environment is a math-y
|
|
* one or a text-y one (e.g. inside \text). Currently, this serves to
|
|
* limit the functions which can be used in text mode.
|
|
*
|
|
* The main functions then return an object which contains the useful data that
|
|
* was parsed at its given point, and a new position at the end of the parsed
|
|
* data. The main functions can call each other and continue the parsing by
|
|
* using the returned position as a new starting point.
|
|
*
|
|
* There are also extra `.handle...` functions, which pull out some reused
|
|
* functionality into self-contained functions.
|
|
*
|
|
* The functions return ParseNodes.
|
|
*/
|
|
|
|
export default class Parser {
|
|
mode: Mode;
|
|
gullet: MacroExpander;
|
|
settings: Settings;
|
|
leftrightDepth: number;
|
|
nextToken: ?Token;
|
|
|
|
constructor(input: string, settings: Settings) {
|
|
// Start in math mode
|
|
this.mode = "math";
|
|
// Create a new macro expander (gullet) and (indirectly via that) also a
|
|
// new lexer (mouth) for this parser (stomach, in the language of TeX)
|
|
this.gullet = new MacroExpander(input, settings, this.mode);
|
|
// Store the settings for use in parsing
|
|
this.settings = settings;
|
|
// Count leftright depth (for \middle errors)
|
|
this.leftrightDepth = 0;
|
|
}
|
|
|
|
/**
|
|
* Checks a result to make sure it has the right type, and throws an
|
|
* appropriate error otherwise.
|
|
*/
|
|
expect(text: string, consume?: boolean = true) {
|
|
if (this.fetch().text !== text) {
|
|
throw new ParseError(
|
|
`Expected '${text}', got '${this.fetch().text}'`, this.fetch()
|
|
);
|
|
}
|
|
if (consume) {
|
|
this.consume();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Discards the current lookahead token, considering it consumed.
|
|
*/
|
|
consume() {
|
|
this.nextToken = null;
|
|
}
|
|
|
|
/**
|
|
* Return the current lookahead token, or if there isn't one (at the
|
|
* beginning, or if the previous lookahead token was consume()d),
|
|
* fetch the next token as the new lookahead token and return it.
|
|
*/
|
|
fetch(): Token {
|
|
if (this.nextToken == null) {
|
|
this.nextToken = this.gullet.expandNextToken();
|
|
}
|
|
return this.nextToken;
|
|
}
|
|
|
|
/**
|
|
* Switches between "text" and "math" modes.
|
|
*/
|
|
switchMode(newMode: Mode) {
|
|
this.mode = newMode;
|
|
this.gullet.switchMode(newMode);
|
|
}
|
|
|
|
/**
|
|
* Main parsing function, which parses an entire input.
|
|
*/
|
|
parse(): AnyParseNode[] {
|
|
if (!this.settings.globalGroup) {
|
|
// Create a group namespace for the math expression.
|
|
// (LaTeX creates a new group for every $...$, $$...$$, \[...\].)
|
|
this.gullet.beginGroup();
|
|
}
|
|
|
|
// Use old \color behavior (same as LaTeX's \textcolor) if requested.
|
|
// We do this within the group for the math expression, so it doesn't
|
|
// pollute settings.macros.
|
|
if (this.settings.colorIsTextColor) {
|
|
this.gullet.macros.set("\\color", "\\textcolor");
|
|
}
|
|
|
|
try {
|
|
// Try to parse the input
|
|
const parse = this.parseExpression(false);
|
|
|
|
// If we succeeded, make sure there's an EOF at the end
|
|
this.expect("EOF");
|
|
|
|
// End the group namespace for the expression
|
|
if (!this.settings.globalGroup) {
|
|
this.gullet.endGroup();
|
|
}
|
|
|
|
return parse;
|
|
|
|
// Close any leftover groups in case of a parse error.
|
|
} finally {
|
|
this.gullet.endGroups();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Fully parse a separate sequence of tokens as a separate job.
|
|
* Tokens should be specified in reverse order, as in a MacroDefinition.
|
|
*/
|
|
subparse(tokens: Token[]): AnyParseNode[] {
|
|
// Save the next token from the current job.
|
|
const oldToken = this.nextToken;
|
|
this.consume();
|
|
|
|
// Run the new job, terminating it with an excess '}'
|
|
this.gullet.pushToken(new Token("}"));
|
|
this.gullet.pushTokens(tokens);
|
|
const parse = this.parseExpression(false);
|
|
this.expect("}");
|
|
|
|
// Restore the next token from the current job.
|
|
this.nextToken = oldToken;
|
|
|
|
return parse;
|
|
}
|
|
|
|
static endOfExpression: string[] = ["}", "\\endgroup", "\\end", "\\right", "&"];
|
|
|
|
/**
|
|
* Parses an "expression", which is a list of atoms.
|
|
*
|
|
* `breakOnInfix`: Should the parsing stop when we hit infix nodes? This
|
|
* happens when functions have higher precedence han infix
|
|
* nodes in implicit parses.
|
|
*
|
|
* `breakOnTokenText`: The text of the token that the expression should end
|
|
* with, or `null` if something else should end the
|
|
* expression.
|
|
*/
|
|
parseExpression(
|
|
breakOnInfix: boolean,
|
|
breakOnTokenText?: BreakToken,
|
|
): AnyParseNode[] {
|
|
const body = [];
|
|
// Keep adding atoms to the body until we can't parse any more atoms (either
|
|
// we reached the end, a }, or a \right)
|
|
while (true) {
|
|
// Ignore spaces in math mode
|
|
if (this.mode === "math") {
|
|
this.consumeSpaces();
|
|
}
|
|
const lex = this.fetch();
|
|
if (Parser.endOfExpression.indexOf(lex.text) !== -1) {
|
|
break;
|
|
}
|
|
if (breakOnTokenText && lex.text === breakOnTokenText) {
|
|
break;
|
|
}
|
|
if (breakOnInfix && functions[lex.text] && functions[lex.text].infix) {
|
|
break;
|
|
}
|
|
const atom = this.parseAtom(breakOnTokenText);
|
|
if (!atom) {
|
|
break;
|
|
} else if (atom.type === "internal") {
|
|
continue;
|
|
}
|
|
body.push(atom);
|
|
}
|
|
if (this.mode === "text") {
|
|
this.formLigatures(body);
|
|
}
|
|
return this.handleInfixNodes(body);
|
|
}
|
|
|
|
/**
|
|
* Rewrites infix operators such as \over with corresponding commands such
|
|
* as \frac.
|
|
*
|
|
* There can only be one infix operator per group. If there's more than one
|
|
* then the expression is ambiguous. This can be resolved by adding {}.
|
|
*/
|
|
handleInfixNodes(body: AnyParseNode[]): AnyParseNode[] {
|
|
let overIndex = -1;
|
|
let funcName;
|
|
|
|
for (let i = 0; i < body.length; i++) {
|
|
if (body[i].type === "infix") {
|
|
if (overIndex !== -1) {
|
|
throw new ParseError(
|
|
"only one infix operator per group",
|
|
body[i].token);
|
|
}
|
|
overIndex = i;
|
|
funcName = body[i].replaceWith;
|
|
}
|
|
}
|
|
|
|
if (overIndex !== -1 && funcName) {
|
|
let numerNode;
|
|
let denomNode;
|
|
|
|
const numerBody = body.slice(0, overIndex);
|
|
const denomBody = body.slice(overIndex + 1);
|
|
|
|
if (numerBody.length === 1 && numerBody[0].type === "ordgroup") {
|
|
numerNode = numerBody[0];
|
|
} else {
|
|
numerNode = {type: "ordgroup", mode: this.mode, body: numerBody};
|
|
}
|
|
|
|
if (denomBody.length === 1 && denomBody[0].type === "ordgroup") {
|
|
denomNode = denomBody[0];
|
|
} else {
|
|
denomNode = {type: "ordgroup", mode: this.mode, body: denomBody};
|
|
}
|
|
|
|
let node;
|
|
if (funcName === "\\\\abovefrac") {
|
|
node = this.callFunction(funcName,
|
|
[numerNode, body[overIndex], denomNode], []);
|
|
} else {
|
|
node = this.callFunction(funcName, [numerNode, denomNode], []);
|
|
}
|
|
return [node];
|
|
} else {
|
|
return body;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Handle a subscript or superscript with nice errors.
|
|
*/
|
|
handleSupSubscript(
|
|
name: string, // For error reporting.
|
|
): AnyParseNode {
|
|
const symbolToken = this.fetch();
|
|
const symbol = symbolToken.text;
|
|
this.consume();
|
|
this.consumeSpaces(); // ignore spaces before sup/subscript argument
|
|
const group = this.parseGroup(name);
|
|
|
|
if (!group) {
|
|
throw new ParseError(
|
|
"Expected group after '" + symbol + "'",
|
|
symbolToken
|
|
);
|
|
}
|
|
|
|
return group;
|
|
}
|
|
|
|
/**
|
|
* Converts the textual input of an unsupported command into a text node
|
|
* contained within a color node whose color is determined by errorColor
|
|
*/
|
|
formatUnsupportedCmd(text: string): UnsupportedCmdParseNode {
|
|
const textordArray = [];
|
|
|
|
for (let i = 0; i < text.length; i++) {
|
|
textordArray.push({type: "textord", mode: "text", text: text[i]});
|
|
}
|
|
|
|
const textNode = {
|
|
type: "text",
|
|
mode: this.mode,
|
|
body: textordArray,
|
|
};
|
|
|
|
const colorNode = {
|
|
type: "color",
|
|
mode: this.mode,
|
|
color: this.settings.errorColor,
|
|
body: [textNode],
|
|
};
|
|
|
|
return colorNode;
|
|
}
|
|
|
|
/**
|
|
* Parses a group with optional super/subscripts.
|
|
*/
|
|
parseAtom(breakOnTokenText?: BreakToken): ?AnyParseNode {
|
|
// The body of an atom is an implicit group, so that things like
|
|
// \left(x\right)^2 work correctly.
|
|
const base = this.parseGroup("atom", breakOnTokenText);
|
|
|
|
// In text mode, we don't have superscripts or subscripts
|
|
if (this.mode === "text") {
|
|
return base;
|
|
}
|
|
|
|
// Note that base may be empty (i.e. null) at this point.
|
|
|
|
let superscript;
|
|
let subscript;
|
|
while (true) {
|
|
// Guaranteed in math mode, so eat any spaces first.
|
|
this.consumeSpaces();
|
|
|
|
// Lex the first token
|
|
const lex = this.fetch();
|
|
|
|
if (lex.text === "\\limits" || lex.text === "\\nolimits") {
|
|
// We got a limit control
|
|
if (base && base.type === "op") {
|
|
const limits = lex.text === "\\limits";
|
|
base.limits = limits;
|
|
base.alwaysHandleSupSub = true;
|
|
} else if (base && base.type === "operatorname") {
|
|
if (base.alwaysHandleSupSub) {
|
|
base.limits = lex.text === "\\limits";
|
|
}
|
|
} else {
|
|
throw new ParseError(
|
|
"Limit controls must follow a math operator",
|
|
lex);
|
|
}
|
|
this.consume();
|
|
} else if (lex.text === "^") {
|
|
// We got a superscript start
|
|
if (superscript) {
|
|
throw new ParseError("Double superscript", lex);
|
|
}
|
|
superscript = this.handleSupSubscript("superscript");
|
|
} else if (lex.text === "_") {
|
|
// We got a subscript start
|
|
if (subscript) {
|
|
throw new ParseError("Double subscript", lex);
|
|
}
|
|
subscript = this.handleSupSubscript("subscript");
|
|
} else if (lex.text === "'") {
|
|
// We got a prime
|
|
if (superscript) {
|
|
throw new ParseError("Double superscript", lex);
|
|
}
|
|
const prime = {type: "textord", mode: this.mode, text: "\\prime"};
|
|
|
|
// Many primes can be grouped together, so we handle this here
|
|
const primes = [prime];
|
|
this.consume();
|
|
// Keep lexing tokens until we get something that's not a prime
|
|
while (this.fetch().text === "'") {
|
|
// For each one, add another prime to the list
|
|
primes.push(prime);
|
|
this.consume();
|
|
}
|
|
// If there's a superscript following the primes, combine that
|
|
// superscript in with the primes.
|
|
if (this.fetch().text === "^") {
|
|
primes.push(this.handleSupSubscript("superscript"));
|
|
}
|
|
// Put everything into an ordgroup as the superscript
|
|
superscript = {type: "ordgroup", mode: this.mode, body: primes};
|
|
} else if (uSubsAndSups[lex.text]) {
|
|
// A Unicode subscript or superscript character.
|
|
// We treat these similarly to the unicode-math package.
|
|
// So we render a string of Unicode (sub|super)scripts the
|
|
// same as a (sub|super)script of regular characters.
|
|
let str = uSubsAndSups[lex.text];
|
|
const isSub = unicodeSubRegEx.test(lex.text);
|
|
this.consume();
|
|
// Continue fetching tokens to fill out the string.
|
|
while (true) {
|
|
const token = this.fetch().text;
|
|
if (!(uSubsAndSups[token])) { break; }
|
|
if (unicodeSubRegEx.test(token) !== isSub) { break; }
|
|
this.consume();
|
|
str += uSubsAndSups[token];
|
|
}
|
|
// Now create a (sub|super)script.
|
|
const body = (new Parser(str, this.settings)).parse();
|
|
if (isSub) {
|
|
subscript = {type: "ordgroup", mode: "math", body};
|
|
} else {
|
|
superscript = {type: "ordgroup", mode: "math", body};
|
|
}
|
|
} else {
|
|
// If it wasn't ^, _, or ', stop parsing super/subscripts
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Base must be set if superscript or subscript are set per logic above,
|
|
// but need to check here for type check to pass.
|
|
if (superscript || subscript) {
|
|
// If we got either a superscript or subscript, create a supsub
|
|
return {
|
|
type: "supsub",
|
|
mode: this.mode,
|
|
base: base,
|
|
sup: superscript,
|
|
sub: subscript,
|
|
};
|
|
} else {
|
|
// Otherwise return the original body
|
|
return base;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Parses an entire function, including its base and all of its arguments.
|
|
*/
|
|
parseFunction(
|
|
breakOnTokenText?: BreakToken,
|
|
name?: string, // For determining its context
|
|
): ?AnyParseNode {
|
|
const token = this.fetch();
|
|
const func = token.text;
|
|
const funcData = functions[func];
|
|
if (!funcData) {
|
|
return null;
|
|
}
|
|
this.consume(); // consume command token
|
|
|
|
if (name && name !== "atom" && !funcData.allowedInArgument) {
|
|
throw new ParseError(
|
|
"Got function '" + func + "' with no arguments" +
|
|
(name ? " as " + name : ""), token);
|
|
} else if (this.mode === "text" && !funcData.allowedInText) {
|
|
throw new ParseError(
|
|
"Can't use function '" + func + "' in text mode", token);
|
|
} else if (this.mode === "math" && funcData.allowedInMath === false) {
|
|
throw new ParseError(
|
|
"Can't use function '" + func + "' in math mode", token);
|
|
}
|
|
|
|
const {args, optArgs} = this.parseArguments(func, funcData);
|
|
return this.callFunction(func, args, optArgs, token, breakOnTokenText);
|
|
}
|
|
|
|
/**
|
|
* Call a function handler with a suitable context and arguments.
|
|
*/
|
|
callFunction(
|
|
name: string,
|
|
args: AnyParseNode[],
|
|
optArgs: (?AnyParseNode)[],
|
|
token?: Token,
|
|
breakOnTokenText?: BreakToken,
|
|
): AnyParseNode {
|
|
const context: FunctionContext = {
|
|
funcName: name,
|
|
parser: this,
|
|
token,
|
|
breakOnTokenText,
|
|
};
|
|
const func = functions[name];
|
|
if (func && func.handler) {
|
|
return func.handler(context, args, optArgs);
|
|
} else {
|
|
throw new ParseError(`No function handler for ${name}`);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Parses the arguments of a function or environment
|
|
*/
|
|
parseArguments(
|
|
func: string, // Should look like "\name" or "\begin{name}".
|
|
funcData: FunctionSpec<*> | EnvSpec<*>,
|
|
): {
|
|
args: AnyParseNode[],
|
|
optArgs: (?AnyParseNode)[],
|
|
} {
|
|
const totalArgs = funcData.numArgs + funcData.numOptionalArgs;
|
|
if (totalArgs === 0) {
|
|
return {args: [], optArgs: []};
|
|
}
|
|
|
|
const args = [];
|
|
const optArgs = [];
|
|
|
|
for (let i = 0; i < totalArgs; i++) {
|
|
let argType = funcData.argTypes && funcData.argTypes[i];
|
|
const isOptional = i < funcData.numOptionalArgs;
|
|
|
|
if ((funcData.primitive && argType == null) ||
|
|
// \sqrt expands into primitive if optional argument doesn't exist
|
|
(funcData.type === "sqrt" && i === 1 && optArgs[0] == null)) {
|
|
argType = "primitive";
|
|
}
|
|
|
|
const arg = this.parseGroupOfType(`argument to '${func}'`,
|
|
argType, isOptional);
|
|
if (isOptional) {
|
|
optArgs.push(arg);
|
|
} else if (arg != null) {
|
|
args.push(arg);
|
|
} else { // should be unreachable
|
|
throw new ParseError("Null argument, please report this as a bug");
|
|
}
|
|
}
|
|
|
|
return {args, optArgs};
|
|
}
|
|
|
|
/**
|
|
* Parses a group when the mode is changing.
|
|
*/
|
|
parseGroupOfType(
|
|
name: string,
|
|
type: ?ArgType,
|
|
optional: boolean,
|
|
): ?AnyParseNode {
|
|
switch (type) {
|
|
case "color":
|
|
return this.parseColorGroup(optional);
|
|
case "size":
|
|
return this.parseSizeGroup(optional);
|
|
case "url":
|
|
return this.parseUrlGroup(optional);
|
|
case "math":
|
|
case "text":
|
|
return this.parseArgumentGroup(optional, type);
|
|
case "hbox": {
|
|
// hbox argument type wraps the argument in the equivalent of
|
|
// \hbox, which is like \text but switching to \textstyle size.
|
|
const group = this.parseArgumentGroup(optional, "text");
|
|
return group != null ? {
|
|
type: "styling",
|
|
mode: group.mode,
|
|
body: [group],
|
|
style: "text", // simulate \textstyle
|
|
} : null;
|
|
}
|
|
case "raw": {
|
|
const token = this.parseStringGroup("raw", optional);
|
|
return token != null ? {
|
|
type: "raw",
|
|
mode: "text",
|
|
string: token.text,
|
|
} : null;
|
|
}
|
|
case "primitive": {
|
|
if (optional) {
|
|
throw new ParseError("A primitive argument cannot be optional");
|
|
}
|
|
const group = this.parseGroup(name);
|
|
if (group == null) {
|
|
throw new ParseError("Expected group as " + name, this.fetch());
|
|
}
|
|
return group;
|
|
}
|
|
case "original":
|
|
case null:
|
|
case undefined:
|
|
return this.parseArgumentGroup(optional);
|
|
default:
|
|
throw new ParseError(
|
|
"Unknown group type as " + name, this.fetch());
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Discard any space tokens, fetching the next non-space token.
|
|
*/
|
|
consumeSpaces() {
|
|
while (this.fetch().text === " ") {
|
|
this.consume();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Parses a group, essentially returning the string formed by the
|
|
* brace-enclosed tokens plus some position information.
|
|
*/
|
|
parseStringGroup(
|
|
modeName: ArgType, // Used to describe the mode in error messages.
|
|
optional: boolean,
|
|
): ?Token {
|
|
const argToken = this.gullet.scanArgument(optional);
|
|
if (argToken == null) {
|
|
return null;
|
|
}
|
|
let str = "";
|
|
let nextToken;
|
|
while ((nextToken = this.fetch()).text !== "EOF") {
|
|
str += nextToken.text;
|
|
this.consume();
|
|
}
|
|
this.consume(); // consume the end of the argument
|
|
argToken.text = str;
|
|
return argToken;
|
|
}
|
|
|
|
/**
|
|
* Parses a regex-delimited group: the largest sequence of tokens
|
|
* whose concatenated strings match `regex`. Returns the string
|
|
* formed by the tokens plus some position information.
|
|
*/
|
|
parseRegexGroup(
|
|
regex: RegExp,
|
|
modeName: string, // Used to describe the mode in error messages.
|
|
): Token {
|
|
const firstToken = this.fetch();
|
|
let lastToken = firstToken;
|
|
let str = "";
|
|
let nextToken;
|
|
while ((nextToken = this.fetch()).text !== "EOF" &&
|
|
regex.test(str + nextToken.text)) {
|
|
lastToken = nextToken;
|
|
str += lastToken.text;
|
|
this.consume();
|
|
}
|
|
if (str === "") {
|
|
throw new ParseError(
|
|
"Invalid " + modeName + ": '" + firstToken.text + "'",
|
|
firstToken);
|
|
}
|
|
return firstToken.range(lastToken, str);
|
|
}
|
|
|
|
/**
|
|
* Parses a color description.
|
|
*/
|
|
parseColorGroup(optional: boolean): ?ParseNode<"color-token"> {
|
|
const res = this.parseStringGroup("color", optional);
|
|
if (res == null) {
|
|
return null;
|
|
}
|
|
const match = (/^(#[a-f0-9]{3}|#?[a-f0-9]{6}|[a-z]+)$/i).exec(res.text);
|
|
if (!match) {
|
|
throw new ParseError("Invalid color: '" + res.text + "'", res);
|
|
}
|
|
let color = match[0];
|
|
if (/^[0-9a-f]{6}$/i.test(color)) {
|
|
// We allow a 6-digit HTML color spec without a leading "#".
|
|
// This follows the xcolor package's HTML color model.
|
|
// Predefined color names are all missed by this RegEx pattern.
|
|
color = "#" + color;
|
|
}
|
|
return {
|
|
type: "color-token",
|
|
mode: this.mode,
|
|
color,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Parses a size specification, consisting of magnitude and unit.
|
|
*/
|
|
parseSizeGroup(optional: boolean): ?ParseNode<"size"> {
|
|
let res;
|
|
let isBlank = false;
|
|
// don't expand before parseStringGroup
|
|
this.gullet.consumeSpaces();
|
|
if (!optional && this.gullet.future().text !== "{") {
|
|
res = this.parseRegexGroup(
|
|
/^[-+]? *(?:$|\d+|\d+\.\d*|\.\d*) *[a-z]{0,2} *$/, "size");
|
|
} else {
|
|
res = this.parseStringGroup("size", optional);
|
|
}
|
|
if (!res) {
|
|
return null;
|
|
}
|
|
if (!optional && res.text.length === 0) {
|
|
// Because we've tested for what is !optional, this block won't
|
|
// affect \kern, \hspace, etc. It will capture the mandatory arguments
|
|
// to \genfrac and \above.
|
|
res.text = "0pt"; // Enable \above{}
|
|
isBlank = true; // This is here specifically for \genfrac
|
|
}
|
|
const match = (/([-+]?) *(\d+(?:\.\d*)?|\.\d+) *([a-z]{2})/).exec(res.text);
|
|
if (!match) {
|
|
throw new ParseError("Invalid size: '" + res.text + "'", res);
|
|
}
|
|
const data = {
|
|
number: +(match[1] + match[2]), // sign + magnitude, cast to number
|
|
unit: match[3],
|
|
};
|
|
if (!validUnit(data)) {
|
|
throw new ParseError("Invalid unit: '" + data.unit + "'", res);
|
|
}
|
|
return {
|
|
type: "size",
|
|
mode: this.mode,
|
|
value: data,
|
|
isBlank,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Parses an URL, checking escaped letters and allowed protocols,
|
|
* and setting the catcode of % as an active character (as in \hyperref).
|
|
*/
|
|
parseUrlGroup(optional: boolean): ?ParseNode<"url"> {
|
|
this.gullet.lexer.setCatcode("%", 13); // active character
|
|
this.gullet.lexer.setCatcode("~", 12); // other character
|
|
const res = this.parseStringGroup("url", optional);
|
|
this.gullet.lexer.setCatcode("%", 14); // comment character
|
|
this.gullet.lexer.setCatcode("~", 13); // active character
|
|
if (res == null) {
|
|
return null;
|
|
}
|
|
// hyperref package allows backslashes alone in href, but doesn't
|
|
// generate valid links in such cases; we interpret this as
|
|
// "undefined" behaviour, and keep them as-is. Some browser will
|
|
// replace backslashes with forward slashes.
|
|
const url = res.text.replace(/\\([#$%&~_^{}])/g, '$1');
|
|
return {
|
|
type: "url",
|
|
mode: this.mode,
|
|
url,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Parses an argument with the mode specified.
|
|
*/
|
|
parseArgumentGroup(optional: boolean, mode?: Mode): ?ParseNode<"ordgroup"> {
|
|
const argToken = this.gullet.scanArgument(optional);
|
|
if (argToken == null) {
|
|
return null;
|
|
}
|
|
const outerMode = this.mode;
|
|
if (mode) { // Switch to specified mode
|
|
this.switchMode(mode);
|
|
}
|
|
|
|
this.gullet.beginGroup();
|
|
const expression = this.parseExpression(false, "EOF");
|
|
// TODO: find an alternative way to denote the end
|
|
this.expect("EOF"); // expect the end of the argument
|
|
this.gullet.endGroup();
|
|
const result = {
|
|
type: "ordgroup",
|
|
mode: this.mode,
|
|
loc: argToken.loc,
|
|
body: expression,
|
|
};
|
|
|
|
if (mode) { // Switch mode back
|
|
this.switchMode(outerMode);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Parses an ordinary group, which is either a single nucleus (like "x")
|
|
* or an expression in braces (like "{x+y}") or an implicit group, a group
|
|
* that starts at the current position, and ends right before a higher explicit
|
|
* group ends, or at EOF.
|
|
*/
|
|
parseGroup(
|
|
name: string, // For error reporting.
|
|
breakOnTokenText?: BreakToken,
|
|
): ?AnyParseNode {
|
|
const firstToken = this.fetch();
|
|
const text = firstToken.text;
|
|
|
|
let result;
|
|
// Try to parse an open brace or \begingroup
|
|
if (text === "{" || text === "\\begingroup") {
|
|
this.consume();
|
|
const groupEnd = text === "{" ? "}" : "\\endgroup";
|
|
|
|
this.gullet.beginGroup();
|
|
// If we get a brace, parse an expression
|
|
const expression = this.parseExpression(false, groupEnd);
|
|
const lastToken = this.fetch();
|
|
this.expect(groupEnd); // Check that we got a matching closing brace
|
|
this.gullet.endGroup();
|
|
result = {
|
|
type: "ordgroup",
|
|
mode: this.mode,
|
|
loc: SourceLocation.range(firstToken, lastToken),
|
|
body: expression,
|
|
// A group formed by \begingroup...\endgroup is a semi-simple group
|
|
// which doesn't affect spacing in math mode, i.e., is transparent.
|
|
// https://tex.stackexchange.com/questions/1930/when-should-one-
|
|
// use-begingroup-instead-of-bgroup
|
|
semisimple: text === "\\begingroup" || undefined,
|
|
};
|
|
} else {
|
|
// If there exists a function with this name, parse the function.
|
|
// Otherwise, just return a nucleus
|
|
result = this.parseFunction(breakOnTokenText, name) ||
|
|
this.parseSymbol();
|
|
if (result == null && text[0] === "\\" &&
|
|
!implicitCommands.hasOwnProperty(text)) {
|
|
if (this.settings.throwOnError) {
|
|
throw new ParseError(
|
|
"Undefined control sequence: " + text, firstToken);
|
|
}
|
|
result = this.formatUnsupportedCmd(text);
|
|
this.consume();
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Form ligature-like combinations of characters for text mode.
|
|
* This includes inputs like "--", "---", "``" and "''".
|
|
* The result will simply replace multiple textord nodes with a single
|
|
* character in each value by a single textord node having multiple
|
|
* characters in its value. The representation is still ASCII source.
|
|
* The group will be modified in place.
|
|
*/
|
|
formLigatures(group: AnyParseNode[]) {
|
|
let n = group.length - 1;
|
|
for (let i = 0; i < n; ++i) {
|
|
const a = group[i];
|
|
// $FlowFixMe: Not every node type has a `text` property.
|
|
const v = a.text;
|
|
if (v === "-" && group[i + 1].text === "-") {
|
|
if (i + 1 < n && group[i + 2].text === "-") {
|
|
group.splice(i, 3, {
|
|
type: "textord",
|
|
mode: "text",
|
|
loc: SourceLocation.range(a, group[i + 2]),
|
|
text: "---",
|
|
});
|
|
n -= 2;
|
|
} else {
|
|
group.splice(i, 2, {
|
|
type: "textord",
|
|
mode: "text",
|
|
loc: SourceLocation.range(a, group[i + 1]),
|
|
text: "--",
|
|
});
|
|
n -= 1;
|
|
}
|
|
}
|
|
if ((v === "'" || v === "`") && group[i + 1].text === v) {
|
|
group.splice(i, 2, {
|
|
type: "textord",
|
|
mode: "text",
|
|
loc: SourceLocation.range(a, group[i + 1]),
|
|
text: v + v,
|
|
});
|
|
n -= 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Parse a single symbol out of the string. Here, we handle single character
|
|
* symbols and special functions like \verb.
|
|
*/
|
|
parseSymbol(): ?AnyParseNode {
|
|
const nucleus = this.fetch();
|
|
let text = nucleus.text;
|
|
|
|
if (/^\\verb[^a-zA-Z]/.test(text)) {
|
|
this.consume();
|
|
let arg = text.slice(5);
|
|
const star = (arg.charAt(0) === "*");
|
|
if (star) {
|
|
arg = arg.slice(1);
|
|
}
|
|
// Lexer's tokenRegex is constructed to always have matching
|
|
// first/last characters.
|
|
if (arg.length < 2 || arg.charAt(0) !== arg.slice(-1)) {
|
|
throw new ParseError(`\\verb assertion failed --
|
|
please report what input caused this bug`);
|
|
}
|
|
arg = arg.slice(1, -1); // remove first and last char
|
|
return {
|
|
type: "verb",
|
|
mode: "text",
|
|
body: arg,
|
|
star,
|
|
};
|
|
}
|
|
// At this point, we should have a symbol, possibly with accents.
|
|
// First expand any accented base symbol according to unicodeSymbols.
|
|
if (unicodeSymbols.hasOwnProperty(text[0]) &&
|
|
!symbols[this.mode][text[0]]) {
|
|
// This behavior is not strict (XeTeX-compatible) in math mode.
|
|
if (this.settings.strict && this.mode === "math") {
|
|
this.settings.reportNonstrict("unicodeTextInMathMode",
|
|
`Accented Unicode text character "${text[0]}" used in ` +
|
|
`math mode`, nucleus);
|
|
}
|
|
text = unicodeSymbols[text[0]] + text.slice(1);
|
|
}
|
|
// Strip off any combining characters
|
|
const match = combiningDiacriticalMarksEndRegex.exec(text);
|
|
if (match) {
|
|
text = text.substring(0, match.index);
|
|
if (text === 'i') {
|
|
text = '\u0131'; // dotless i, in math and text mode
|
|
} else if (text === 'j') {
|
|
text = '\u0237'; // dotless j, in math and text mode
|
|
}
|
|
}
|
|
// Recognize base symbol
|
|
let symbol: AnyParseNode;
|
|
if (symbols[this.mode][text]) {
|
|
if (this.settings.strict && this.mode === 'math' &&
|
|
extraLatin.indexOf(text) >= 0) {
|
|
this.settings.reportNonstrict("unicodeTextInMathMode",
|
|
`Latin-1/Unicode text character "${text[0]}" used in ` +
|
|
`math mode`, nucleus);
|
|
}
|
|
const group: Group = symbols[this.mode][text].group;
|
|
const loc = SourceLocation.range(nucleus);
|
|
let s: SymbolParseNode;
|
|
if (ATOMS.hasOwnProperty(group)) {
|
|
// $FlowFixMe
|
|
const family: Atom = group;
|
|
s = {
|
|
type: "atom",
|
|
mode: this.mode,
|
|
family,
|
|
loc,
|
|
text,
|
|
};
|
|
} else {
|
|
// $FlowFixMe
|
|
s = {
|
|
type: group,
|
|
mode: this.mode,
|
|
loc,
|
|
text,
|
|
};
|
|
}
|
|
// $FlowFixMe
|
|
symbol = s;
|
|
} else if (text.charCodeAt(0) >= 0x80) { // no symbol for e.g. ^
|
|
if (this.settings.strict) {
|
|
if (!supportedCodepoint(text.charCodeAt(0))) {
|
|
this.settings.reportNonstrict("unknownSymbol",
|
|
`Unrecognized Unicode character "${text[0]}"` +
|
|
` (${text.charCodeAt(0)})`, nucleus);
|
|
} else if (this.mode === "math") {
|
|
this.settings.reportNonstrict("unicodeTextInMathMode",
|
|
`Unicode text character "${text[0]}" used in math mode`,
|
|
nucleus);
|
|
}
|
|
}
|
|
// All nonmathematical Unicode characters are rendered as if they
|
|
// are in text mode (wrapped in \text) because that's what it
|
|
// takes to render them in LaTeX. Setting `mode: this.mode` is
|
|
// another natural choice (the user requested math mode), but
|
|
// this makes it more difficult for getCharacterMetrics() to
|
|
// distinguish Unicode characters without metrics and those for
|
|
// which we want to simulate the letter M.
|
|
symbol = {
|
|
type: "textord",
|
|
mode: "text",
|
|
loc: SourceLocation.range(nucleus),
|
|
text,
|
|
};
|
|
} else {
|
|
return null; // EOF, ^, _, {, }, etc.
|
|
}
|
|
this.consume();
|
|
// Transform combining characters into accents
|
|
if (match) {
|
|
for (let i = 0; i < match[0].length; i++) {
|
|
const accent: string = match[0][i];
|
|
if (!unicodeAccents[accent]) {
|
|
throw new ParseError(`Unknown accent ' ${accent}'`, nucleus);
|
|
}
|
|
const command = unicodeAccents[accent][this.mode] ||
|
|
unicodeAccents[accent].text;
|
|
if (!command) {
|
|
throw new ParseError(
|
|
`Accent ${accent} unsupported in ${this.mode} mode`,
|
|
nucleus);
|
|
}
|
|
symbol = {
|
|
type: "accent",
|
|
mode: this.mode,
|
|
loc: SourceLocation.range(nucleus),
|
|
label: command,
|
|
isStretchy: false,
|
|
isShifty: true,
|
|
// $FlowFixMe
|
|
base: symbol,
|
|
};
|
|
}
|
|
}
|
|
// $FlowFixMe
|
|
return symbol;
|
|
}
|
|
}
|