198 lines
5.1 KiB
Rust
198 lines
5.1 KiB
Rust
use std::borrow::Cow;
|
|
|
|
use regex_syntax::ast::parse::Parser;
|
|
use regex_syntax::ast::{self, *};
|
|
|
|
// covert ecma regex to rust regex if possible
|
|
// see https://262.ecma-international.org/11.0/#sec-regexp-regular-expression-objects
|
|
pub(crate) fn convert(pattern: &str) -> Result<Cow<'_, str>, Box<dyn std::error::Error>> {
|
|
let mut pattern = Cow::Borrowed(pattern);
|
|
|
|
let mut ast = loop {
|
|
match Parser::new().parse(pattern.as_ref()) {
|
|
Ok(ast) => break ast,
|
|
Err(e) => {
|
|
if let Some(s) = fix_error(&e) {
|
|
pattern = Cow::Owned(s);
|
|
} else {
|
|
Err(e)?;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
loop {
|
|
let translator = Translator {
|
|
pat: pattern.as_ref(),
|
|
out: None,
|
|
};
|
|
if let Some(updated_pattern) = ast::visit(&ast, translator)? {
|
|
match Parser::new().parse(&updated_pattern) {
|
|
Ok(updated_ast) => {
|
|
pattern = Cow::Owned(updated_pattern);
|
|
ast = updated_ast;
|
|
}
|
|
Err(e) => {
|
|
debug_assert!(
|
|
false,
|
|
"ecma::translate changed {:?} to {:?}: {e}",
|
|
pattern, updated_pattern
|
|
);
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
Ok(pattern)
|
|
}
|
|
|
|
fn fix_error(e: &Error) -> Option<String> {
|
|
if let ErrorKind::EscapeUnrecognized = e.kind() {
|
|
let (start, end) = (e.span().start.offset, e.span().end.offset);
|
|
let s = &e.pattern()[start..end];
|
|
if let r"\c" = s {
|
|
// handle \c{control_letter}
|
|
if let Some(control_letter) = e.pattern()[end..].chars().next() {
|
|
if control_letter.is_ascii_alphabetic() {
|
|
return Some(format!(
|
|
"{}{}{}",
|
|
&e.pattern()[..start],
|
|
((control_letter as u8) % 32) as char,
|
|
&e.pattern()[end + 1..],
|
|
));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/**
AST visitor that rewrites one ECMA 262 construct per pass into Rust regex syntax.

Handles the following translations:
- `\d` should match ASCII digits only, so it is replaced with `[0-9]`
- `\D` should match everything but ASCII digits, so it is replaced with `[^0-9]`
- `\w` should match ASCII word characters only, so it is replaced with `[A-Za-z0-9_]`
- `\W` should match everything but ASCII word characters, so it is replaced with
  `[^A-Za-z0-9_]`
- `\s` and `\S` are replaced with an explicit character class, because the two
  dialects differ in what they treat as whitespace
- `\a` is rejected with an error: it is not an ECMA 262 control escape
*/
struct Translator<'a> {
    /// The pattern being inspected; replacements are spliced into a copy of it.
    pat: &'a str,
    /// The rewritten pattern, set once a replacement has been made; `None` otherwise.
    out: Option<String>,
}
|
|
|
|
impl Translator<'_> {
|
|
fn replace(&mut self, span: &Span, with: &str) {
|
|
let (start, end) = (span.start.offset, span.end.offset);
|
|
self.out = Some(format!("{}{with}{}", &self.pat[..start], &self.pat[end..]));
|
|
}
|
|
|
|
fn replace_class_class(&mut self, perl: &ClassPerl) {
|
|
match perl.kind {
|
|
ClassPerlKind::Digit => {
|
|
self.replace(&perl.span, if perl.negated { "[^0-9]" } else { "[0-9]" });
|
|
}
|
|
ClassPerlKind::Word => {
|
|
let with = &if perl.negated {
|
|
"[^A-Za-z0-9_]"
|
|
} else {
|
|
"[A-Za-z0-9_]"
|
|
};
|
|
self.replace(&perl.span, with);
|
|
}
|
|
ClassPerlKind::Space => {
|
|
let with = &if perl.negated {
|
|
"[^ \t\n\r\u{000b}\u{000c}\u{00a0}\u{feff}\u{2003}\u{2029}]"
|
|
} else {
|
|
"[ \t\n\r\u{000b}\u{000c}\u{00a0}\u{feff}\u{2003}\u{2029}]"
|
|
};
|
|
self.replace(&perl.span, with);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Visitor for Translator<'_> {
|
|
type Output = Option<String>;
|
|
type Err = &'static str;
|
|
|
|
fn finish(self) -> Result<Self::Output, Self::Err> {
|
|
Ok(self.out)
|
|
}
|
|
|
|
fn visit_class_set_item_pre(&mut self, ast: &ast::ClassSetItem) -> Result<(), Self::Err> {
|
|
if let ClassSetItem::Perl(perl) = ast {
|
|
self.replace_class_class(perl);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
fn visit_post(&mut self, ast: &Ast) -> Result<(), Self::Err> {
|
|
if self.out.is_some() {
|
|
return Ok(());
|
|
}
|
|
match ast {
|
|
Ast::ClassPerl(perl) => {
|
|
self.replace_class_class(perl);
|
|
}
|
|
Ast::Literal(ref literal) => {
|
|
if let Literal {
|
|
kind: LiteralKind::Special(SpecialLiteralKind::Bell),
|
|
..
|
|
} = literal.as_ref()
|
|
{
|
|
return Err("\\a is not an ECMA 262 control escape");
|
|
}
|
|
}
|
|
_ => (),
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Patterns that must convert successfully, paired with the expected output.
    #[test]
    fn test_ecma_compat_valid() {
        let tests = [
            (r"ab\cAcde\cBfg", "ab\u{1}cde\u{2}fg"), // \c{control_letter}
            (r"\\comment", r"\\comment"),            // there is no \c
            (r"ab\def", r"ab[0-9]ef"),               // \d
            (r"ab[a-z\d]ef", r"ab[a-z[0-9]]ef"),     // \d inside classSet
            (r"ab\Def", r"ab[^0-9]ef"),              // \D
            (r"ab[a-z\D]ef", r"ab[a-z[^0-9]]ef"),    // \D inside classSet
            (r"ab\wef", r"ab[A-Za-z0-9_]ef"),        // \w
            (r"ab\Wef", r"ab[^A-Za-z0-9_]ef"),       // \W
        ];
        for (input, want) in tests {
            match convert(input) {
                Ok(got) => {
                    assert!(
                        got.as_ref() == want,
                        "convert({input:?}): got: {got:?}, want: {want:?}"
                    );
                }
                Err(e) => panic!("convert({input:?}) failed: {e}"),
            }
        }
    }

    /// Patterns that must be rejected.
    #[test]
    fn test_ecma_compat_invalid() {
        let tests = [
            r"\c\n",     // \c{invalid_char}
            r"abc\adef", // \a is not valid
        ];
        for input in tests {
            assert!(convert(input).is_err(), "convert({input:?}) must fail");
        }
    }
}
|