Files
jspg/validator/src/ecma.rs

198 lines
5.1 KiB
Rust

use std::borrow::Cow;
use regex_syntax::ast::parse::Parser;
use regex_syntax::ast::{self, *};
// covert ecma regex to rust regex if possible
// see https://262.ecma-international.org/11.0/#sec-regexp-regular-expression-objects
pub(crate) fn convert(pattern: &str) -> Result<Cow<'_, str>, Box<dyn std::error::Error>> {
let mut pattern = Cow::Borrowed(pattern);
let mut ast = loop {
match Parser::new().parse(pattern.as_ref()) {
Ok(ast) => break ast,
Err(e) => {
if let Some(s) = fix_error(&e) {
pattern = Cow::Owned(s);
} else {
Err(e)?;
}
}
}
};
loop {
let translator = Translator {
pat: pattern.as_ref(),
out: None,
};
if let Some(updated_pattern) = ast::visit(&ast, translator)? {
match Parser::new().parse(&updated_pattern) {
Ok(updated_ast) => {
pattern = Cow::Owned(updated_pattern);
ast = updated_ast;
}
Err(e) => {
debug_assert!(
false,
"ecma::translate changed {:?} to {:?}: {e}",
pattern, updated_pattern
);
break;
}
}
} else {
break;
}
}
Ok(pattern)
}
fn fix_error(e: &Error) -> Option<String> {
if let ErrorKind::EscapeUnrecognized = e.kind() {
let (start, end) = (e.span().start.offset, e.span().end.offset);
let s = &e.pattern()[start..end];
if let r"\c" = s {
// handle \c{control_letter}
if let Some(control_letter) = e.pattern()[end..].chars().next() {
if control_letter.is_ascii_alphabetic() {
return Some(format!(
"{}{}{}",
&e.pattern()[..start],
((control_letter as u8) % 32) as char,
&e.pattern()[end + 1..],
));
}
}
}
}
None
}
/**
handles following translations:
- \d should ascii digits only. so replace with [0-9]
- \D should match everything but ascii digits. so replace with [^0-9]
- \w should match ascii letters only. so replace with [a-zA-Z0-9_]
- \W should match everything but ascii letters. so replace with [^a-zA-Z0-9_]
- \s and \S differences
- \a is not an ECMA 262 control escape
*/
struct Translator<'a> {
pat: &'a str,
out: Option<String>,
}
impl Translator<'_> {
fn replace(&mut self, span: &Span, with: &str) {
let (start, end) = (span.start.offset, span.end.offset);
self.out = Some(format!("{}{with}{}", &self.pat[..start], &self.pat[end..]));
}
fn replace_class_class(&mut self, perl: &ClassPerl) {
match perl.kind {
ClassPerlKind::Digit => {
self.replace(&perl.span, if perl.negated { "[^0-9]" } else { "[0-9]" });
}
ClassPerlKind::Word => {
let with = &if perl.negated {
"[^A-Za-z0-9_]"
} else {
"[A-Za-z0-9_]"
};
self.replace(&perl.span, with);
}
ClassPerlKind::Space => {
let with = &if perl.negated {
"[^ \t\n\r\u{000b}\u{000c}\u{00a0}\u{feff}\u{2003}\u{2029}]"
} else {
"[ \t\n\r\u{000b}\u{000c}\u{00a0}\u{feff}\u{2003}\u{2029}]"
};
self.replace(&perl.span, with);
}
}
}
}
impl Visitor for Translator<'_> {
type Output = Option<String>;
type Err = &'static str;
fn finish(self) -> Result<Self::Output, Self::Err> {
Ok(self.out)
}
fn visit_class_set_item_pre(&mut self, ast: &ast::ClassSetItem) -> Result<(), Self::Err> {
if let ClassSetItem::Perl(perl) = ast {
self.replace_class_class(perl);
}
Ok(())
}
fn visit_post(&mut self, ast: &Ast) -> Result<(), Self::Err> {
if self.out.is_some() {
return Ok(());
}
match ast {
Ast::ClassPerl(perl) => {
self.replace_class_class(perl);
}
Ast::Literal(ref literal) => {
if let Literal {
kind: LiteralKind::Special(SpecialLiteralKind::Bell),
..
} = literal.as_ref()
{
return Err("\\a is not an ECMA 262 control escape");
}
}
_ => (),
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ecma_compat_valid() {
// println!("{:#?}", Parser::new().parse(r#"a\a"#));
let tests = [
(r"ab\cAcde\cBfg", "ab\u{1}cde\u{2}fg"), // \c{control_letter}
(r"\\comment", r"\\comment"), // there is no \c
(r"ab\def", r#"ab[0-9]ef"#), // \d
(r"ab[a-z\d]ef", r#"ab[a-z[0-9]]ef"#), // \d inside classSet
(r"ab\Def", r#"ab[^0-9]ef"#), // \d
(r"ab[a-z\D]ef", r#"ab[a-z[^0-9]]ef"#), // \D inside classSet
];
for (input, want) in tests {
match convert(input) {
Ok(got) => {
if got.as_ref() != want {
panic!("convert({input:?}): got: {got:?}, want: {want:?}");
}
}
Err(e) => {
panic!("convert({input:?}) failed: {e}");
}
}
}
}
#[test]
fn test_ecma_compat_invalid() {
// println!("{:#?}", Parser::new().parse(r#"a\a"#));
let tests = [
r"\c\n", // \c{invalid_char}
r"abc\adef", // \a is not valid
];
for input in tests {
if convert(input).is_ok() {
panic!("convert({input:?}) mut fail");
}
}
}
}