// @flow
/**
 * The Lexer class handles tokenizing the input.
 *
 * Its main exposed function is the `lex` function. It takes no arguments:
 * it lexes a single token starting at the lexer's current position (kept in
 * `tokenRegex.lastIndex`), advances that position past the token, and skips
 * over comments, returning an "EOF" token once the end of the input is
 * reached.
 */

import ParseError from "./ParseError";
import SourceLocation from "./SourceLocation";
import {Token} from "./Token";

import type {LexerInterface} from "./Token";
import type Settings from "./Settings";

/* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
 * - does not match any control character \x00-\x1f except whitespace
 * - does not match a bare backslash
 * - matches any ASCII character except those just mentioned
 * - does not match the BMP private use area \uE000-\uF8FF
 * - does not match bare surrogate code units
 * - matches any BMP character except for those just described
 * - matches any valid Unicode surrogate pair
 * - matches a backslash followed by one or more whitespace characters
 * - matches a backslash followed by one or more letters then whitespace
 * - matches a backslash followed by any BMP character
 * Capturing groups:
 *   [1] regular whitespace
 *   [2] backslash followed by whitespace
 *   [3] anything else, which may include:
 *     [4] left character of \verb*
 *     [5] left character of \verb
 *     [6] backslash followed by word, excluding any trailing whitespace
 * Just because the Lexer matches something doesn't mean it's valid input:
 * If there is no matching function or symbol definition, the Parser will
 * still reject the input.
*/ const spaceRegexString = "[ \r\n\t]"; const controlWordRegexString = "\\\\[a-zA-Z@]+"; const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]"; const controlWordWhitespaceRegexString = `(${controlWordRegexString})${spaceRegexString}*`; const controlSpaceRegexString = "\\\\(\n|[ \r\t]+\n?)[ \r\t]*"; const combiningDiacriticalMarkString = "[\u0300-\u036f]"; export const combiningDiacriticalMarksEndRegex: RegExp = new RegExp(`${combiningDiacriticalMarkString}+$`); const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace `${controlSpaceRegexString}|` + // \whitespace "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint `${combiningDiacriticalMarkString}*` + // ...plus accents "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair `${combiningDiacriticalMarkString}*` + // ...plus accents "|\\\\verb\\*([^]).*?\\4" + // \verb* "|\\\\verb([^*a-zA-Z]).*?\\5" + // \verb unstarred `|${controlWordWhitespaceRegexString}` + // \macroName + spaces `|${controlSymbolRegexString})`; // \\, \', etc. /** Main Lexer class */ export default class Lexer implements LexerInterface { input: string; settings: Settings; tokenRegex: RegExp; // Category codes. The lexer only supports comment characters (14) for now. // MacroExpander additionally distinguishes active (13). catcodes: {[string]: number}; constructor(input: string, settings: Settings) { // Separate accents from characters this.input = input; this.settings = settings; this.tokenRegex = new RegExp(tokenRegexString, 'g'); this.catcodes = { "%": 14, // comment character "~": 13, // active character }; } setCatcode(char: string, code: number) { this.catcodes[char] = code; } /** * This function lexes a single token. 
*/ lex(): Token { const input = this.input; const pos = this.tokenRegex.lastIndex; if (pos === input.length) { return new Token("EOF", new SourceLocation(this, pos, pos)); } const match = this.tokenRegex.exec(input); if (match === null || match.index !== pos) { throw new ParseError( `Unexpected character: '${input[pos]}'`, new Token(input[pos], new SourceLocation(this, pos, pos + 1))); } const text = match[6] || match[3] || (match[2] ? "\\ " : " "); if (this.catcodes[text] === 14) { // comment character const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex); if (nlIndex === -1) { this.tokenRegex.lastIndex = input.length; // EOF this.settings.reportNonstrict("commentAtEnd", "% comment has no terminating newline; LaTeX would " + "fail because of commenting the end of math mode (e.g. $)"); } else { this.tokenRegex.lastIndex = nlIndex + 1; } return this.lex(); } return new Token(text, new SourceLocation(this, pos, this.tokenRegex.lastIndex)); } }