'use strict';
const hljs = require('../../build');
const { BFS, parseRegex, regexFor } = require('./lib/util.js');
const { visitRegExpAST } = require('regexpp');
const { JS, Words, NFA, CharSet } = require('refa');
const { firstOf, underAStar, isFirstMatch, isAlwaysZeroWidth} = require('./lib/analysis.js');
hljs.debugMode();
const expBacktrackingCache = {};
const polyBacktrackingCache = {};
function retrieveRules(language, { name }) {
hljs.highlight("", {language: name});
return regexFor(language, { context: name, depth: 0 });
}
function forEachPattern(list, fn) {
const errors = [];
for (const rule of list) {
const ast = parseRegex(rule.re);
fn({
ast,
pattern: rule.re,
rulePath: rule.path,
reportError: message => errors.push(message)
});
};
if (errors.length > 0) {
throw new Error(errors.map(e => String(e.message || e)).join('\n\n'));
}
}
function testLanguage(languageName) {
const language = hljs.getLanguage(languageName);
const rules = retrieveRules(language, { name: languageName });
count += rules.length;
describe(languageName, function() {
it("have a name", function() {
language.name.should.not.equal(undefined);
});
it(`have ${rules.length} regex matchers`, () => {} );
it('should not use octal escapes', function() {
forEachPattern(rules, ({ ast, rulePath, reportError }) => {
visitRegExpAST(ast.pattern, {
onCharacterEnter(node) {
if (/^\\(?:[1-9]|\d{2,})$/.test(node.raw)) {
reportError(`${rulePath}: Octal escape ${node.raw}.\n\n` +
`Octal escapes can be confused with backreferences, so please do not use them.\n` +
`To fix this, use a different escape method. ` +
`Note that this could also be an invalid backreference, so be sure to carefully analyse the pattern.`);
}
}
});
});
});
it('should not cause exponential backtracking', function () {
forEachPattern(rules, ({ pattern, ast, rulePath, reportError }) => {
const patternStr = String(pattern);
if (expBacktrackingCache[patternStr] === false) {
return;
}
const parser = JS.Parser.fromAst(ast);
function toNFA(element, debug = false) {
const { expression, maxCharacter } = parser.parseElement(element, {
backreferences: "resolve",
lookarounds: "disable",
});
return NFA.fromRegex(expression, { maxCharacter });
}
function checkDisjointAlternatives(node) {
if (!underAStar(node) || node.alternatives.length < 2) {
return;
}
const alternatives = node.alternatives;
const total = toNFA(alternatives[0]);
total.removeEmptyWord();
for (let i = 1, l = alternatives.length; i < l; i++) {
const a = alternatives[i];
const current = toNFA(a);
current.removeEmptyWord();
if (!total.isDisjointWith(current)) {
reportError(`${rulePath}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
+ ` This will cause exponential backtracking.`
+ `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
+ ` The goal is that all of its alternatives are disjoint.`
+ ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
+ `\n\nExample: \`(?:[ab]|\\w|::)+\``
+ `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
+ ` In this example, the pattern by easily fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
+ `\nIn the real world, patterns can be a lot harder to fix.`
+ ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
+ ` A maintainer will help you.`
+ `\n\nFull pattern:\n${pattern}`);
} else if (i !== l - 1) {
total.union(current);
}
}
}
visitRegExpAST(ast.pattern, {
onCapturingGroupLeave: checkDisjointAlternatives,
onGroupLeave: checkDisjointAlternatives,
onAssertionLeave(node) {
if (node.kind === "lookahead" || node.kind === "lookbehind") {
checkDisjointAlternatives(node);
}
},
onQuantifierLeave(node) {
if (node.max < 10) {
return;
}
if (node.element.type !== "CapturingGroup" && node.element.type !== "Group") {
return;
}
const nfa = toNFA(node.element, true);
nfa.removeEmptyWord();
const twoStar = nfa.copy();
twoStar.quantify(2, Infinity);
if (!nfa.isDisjointWith(twoStar)) {
const example = Words.fromUnicodeToString(firstOf(NFA.intersectionWords(nfa, twoStar)));
reportError(`${rulePath}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
+ ` This will cause exponential backtracking.`
+ `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
+ ` The goal is modify E such that it is disjoint with repetitions of itself.`
+ ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
+ `\n\nExample: \`(?:\\w+|::)+\``
+ `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
+ ` In this example, the pattern by easily fixed by changing \`\\w+\` to \`\\w\`.`
+ `\nIn the real world, patterns can be a lot harder to fix.`
+ ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
+ ` A maintainer will help you.`
+ `\n\nFull pattern:\n${pattern}`);
}
},
});
expBacktrackingCache[patternStr] = false;
});
});
it('should not cause polynomial backtracking', function () {
forEachPattern(rules, ({ pattern, ast, rulePath, reportError }) => {
const patternStr = String(pattern);
if (polyBacktrackingCache[patternStr] === false) {
return;
}
const EMPTY = ast.flags.unicode ? CharSet.empty(0x10FFFF) : CharSet.empty(0xFFFF);
function toCharSet(node) {
switch (node.type) {
case "Alternative": {
if (node.elements.length === 1) {
return toCharSet(node.elements[0]);
}
return EMPTY;
}
case "CapturingGroup":
case "Group": {
let total = EMPTY;
for (const item of node.alternatives) {
total = total.union(toCharSet(item));
}
return total;
}
case "Character":
return JS.createCharSet([node.value], ast.flags);
case "CharacterClass": {
const value = JS.createCharSet(node.elements.map(x => {
if (x.type === "CharacterSet") {
return x;
} else if (x.type === "Character") {
return x.value;
} else {
return { min: x.min.value, max: x.max.value };
}
}), ast.flags);
if (node.negate) {
return value.negate();
} else {
return value;
}
}
case "CharacterSet":
return JS.createCharSet([node], ast.flags);
default:
return EMPTY;
}
}
function getAfter(from) {
const parent = from.parent;
if (parent.type === "Quantifier") {
return getAfter(parent);
} else if (parent.type === "Alternative") {
const index = parent.elements.indexOf(from);
const after = parent.elements[index + 1];
if (after) {
return after;
} else {
const grandParent = parent.parent;
if (grandParent.type === "Pattern") {
return null;
} else {
return getAfter(grandParent);
}
}
} else {
throw Error("Unreachable");
}
}
visitRegExpAST(ast.pattern, {
onQuantifierLeave(node) {
if (node.max !== Infinity) {
return;
}
const char = toCharSet(node.element);
tryReachUntil(getAfter(node), char, null);
function assertNoPoly(quantifier, char) {
if (quantifier.max === Infinity) {
const qChar = toCharSet(quantifier.element);
if (qChar && !qChar.isDisjointWith(char)) {
const intersection = qChar.intersect(char);
const literal = JS.toLiteral({
type: "Concatenation",
elements: [
{ type: "CharacterClass", characters: intersection }
]
})
const lang = `/${literal.source}/${literal.flags}`;
const rangeStr = patternStr.substring(node.start + 1, quantifier.end + 1);
const rangeHighlight = `^${"~".repeat(node.end - node.start - 1)}${" ".repeat(quantifier.start - node.end)}^${"~".repeat(quantifier.end - quantifier.start - 1)}`;
reportError(`${rulePath}: Polynomial backtracking. By repeating any character that matches ${lang}, an attack string can be created.\n\n ${rangeStr}\n ${rangeHighlight}\n\nFull pattern:\n${patternStr}\n${" ".repeat(node.start + 1)}${rangeHighlight}`);
}
}
}
function tryReachUntil(element, char, until) {
if (!element || element == until || char.isEmpty) {
return char;
}
const after = getAfter(element);
if (element.type === "Quantifier") {
assertNoPoly(element, char);
}
return tryReachUntil(after, goInto(element, after, char), until);
}
function goInto(element, after, char) {
switch (element.type) {
case "Assertion": {
if (element.kind === "lookahead" || element.kind === "lookbehind") {
for (const alt of element.alternatives) {
if (alt.elements.length > 0) {
tryReachUntil(alt.elements[0], char, after);
}
}
}
return EMPTY;
}
case "Group":
case "CapturingGroup": {
let total = EMPTY;
for (const alt of element.alternatives) {
if (alt.elements.length > 0) {
total = total.union(tryReachUntil(alt.elements[0], char, after));
} else {
total = char;
}
}
return total;
}
case "Character":
case "CharacterClass":
case "CharacterSet": {
return char.intersect(toCharSet(element));
}
case "Quantifier": {
if (element.min === 0) {
goInto(element.element, after, char);
return char;
} else {
return goInto(element.element, after, char);
}
}
default:
return EMPTY;
}
}
},
});
polyBacktrackingCache[patternStr] = false;
});
});
});
}
let count = 0;
let languages = hljs.listLanguages();
if (process.env.ONLY_LANG) {
languages = [process.env.ONLY_LANG];
}
for (const language of languages) {
testLanguage(language);
}
describe("COMBINED: All grammars", () => {
it(`have ${count} total regex`, () => {});
});