Skip to content

Commit f2d3e4c

Browse files
committed
fix JS regex literal parsing in char classes
1 parent 3931a96 commit f2d3e4c

File tree

3 files changed

+93
-6
lines changed

3 files changed

+93
-6
lines changed

compiler/syntax/src/res_scanner.ml

Lines changed: 71 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -580,9 +580,53 @@ let scan_regex scanner =
580580
bring_buf_up_to_date ~start_offset:last_char_offset;
581581
Buffer.contents buf)
582582
in
583-
let rec scan () =
583+
(* Look ahead from the '[' at absolute offset [from_offset] and report whether
   a character-class closer exists on the same line.

   Semantics (kept in step with how [scan] tracks beginning-of-class):
   - A '^' seen while still at the beginning of the class is not content.
   - The first ']' seen while still at the beginning of the class (directly
     after '[' or '[^') is literal content, not a closer.
   - An escape (a backslash plus the character after it) is skipped as a
     unit and counts as content, so a ']' right after it is a closer.
   - Returns [true] only when an unescaped ']' follows some content before a
     line break or the end of the source; otherwise [false]. *)
let has_valid_class_closer_ahead ~from_offset =
  let src = scanner.src in
  let len = String.length src in
  (* [i] is the offset being inspected; [at_bos] stays true until the class
     has received its first piece of content. The [i >= len] guard is what
     makes the unchecked read below safe. *)
  let rec loop i ~at_bos =
    if i >= len then false
    else
      match String.unsafe_get src i with
      | '\n' | '\r' -> false
      | '\\' ->
        (* A trailing backslash has nothing left to escape, so no closer can
           follow it. Otherwise jump over the escaped character and clear
           beginning-of-class: an escape is content. *)
        if i + 1 < len then loop (i + 2) ~at_bos:false else false
      | '^' when at_bos ->
        (* Leading caret does not count as content. *)
        loop (i + 1) ~at_bos:true
      | ']' when at_bos ->
        (* Leading ']' is literal content; after it we are past BOS. *)
        loop (i + 1) ~at_bos:false
      | ']' -> true
      | _ -> loop (i + 1) ~at_bos:false
  in
  (* Start scanning just past the current '['. *)
  loop (from_offset + 1) ~at_bos:true
in
623+
624+
(* Scan until closing '/' that is not inside a character class. Only enter
625+
character-class mode when a valid ']' is present ahead (same line).
626+
Track beginning-of-class to allow a leading ']' (or leading '^' then ']'). *)
627+
let rec scan ~in_class ~class_at_bos =
584628
match scanner.ch with
585-
| '/' ->
629+
| '/' when not in_class ->
586630
let last_char_offset = scanner.offset in
587631
next scanner;
588632
let pattern = result ~first_char_offset ~last_char_offset in
@@ -606,12 +650,34 @@ let scan_regex scanner =
606650
| '\\' ->
607651
next scanner;
608652
next scanner;
609-
scan ()
653+
(* Escapes count as content when inside a class; clear BOS. *)
654+
scan ~in_class ~class_at_bos:(if in_class then false else class_at_bos)
655+
| '[' when not in_class ->
656+
(* Only enter a character class if a closing ']' exists ahead on the
657+
same line. Otherwise treat '[' as a normal char. *)
658+
if has_valid_class_closer_ahead ~from_offset:scanner.offset then (
659+
next scanner;
660+
scan ~in_class:true ~class_at_bos:true)
661+
else (
662+
next scanner;
663+
scan ~in_class ~class_at_bos)
664+
| '^' when in_class && class_at_bos ->
665+
(* Leading caret does not count as content. *)
666+
next scanner;
667+
scan ~in_class ~class_at_bos:true
668+
| ']' when in_class && class_at_bos ->
669+
(* First ']' after '[' or '[^' is literal, not a closer. *)
670+
next scanner;
671+
scan ~in_class ~class_at_bos:false
672+
| ']' when in_class ->
673+
(* Leave character class. *)
674+
next scanner;
675+
scan ~in_class:false ~class_at_bos:false
610676
| _ ->
611677
next scanner;
612-
scan ()
678+
scan ~in_class ~class_at_bos:(if in_class then false else class_at_bos)
613679
in
614-
let pattern, flags = scan () in
680+
let pattern, flags = scan ~in_class:false ~class_at_bos:false in
615681
let end_pos = position scanner in
616682
(start_pos, end_pos, Token.Regex (pattern, flags))
617683

tests/syntax_tests/data/parsing/grammar/expressions/expected/regex.res.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -499,4 +499,12 @@ let re = [%re {js|/^a*?$/|js}]
499499
let re = [%re {js|/^((a)c)?(ab)$/|js}]
500500
let re = [%re {js|/^([ab]*?)(?=(b)?)c/|js}]
501501
let re = [%re {js|/^([ab]*?)(?!(b))c/|js}]
502-
let re = [%re {js|/^([ab]*?)(?<!(a))c/|js}]
502+
let re = [%re {js|/^([ab]*?)(?<!(a))c/|js}]
503+
let re = [%re {js|/\.[^/.]+$/|js}]
504+
let re = [%re {js|/[]/]/|js}]
505+
let re = [%re {js|/[^]]/|js}]
506+
let re = [%re {js|/[/]/|js}]
507+
let re = [%re {js|/[]]/|js}]
508+
let re = [%re {js|/[\]]/|js}]
509+
let re = [%re {js|/[[]]/|js}]
510+
let re = [%re {js|/[^]/]/|js}]

tests/syntax_tests/data/parsing/grammar/expressions/regex.res

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,3 +607,16 @@ let re = /^((a)c)?(ab)$/
607607
let re = /^([ab]*?)(?=(b)?)c/
608608
let re = /^([ab]*?)(?!(b))c/
609609
let re = /^([ab]*?)(?<!(a))c/
610+
611+
let re = /\.[^/.]+$/
612+
613+
// Leading ']' is literal; '/' inside class must not terminate
614+
let re = /[]/]/
615+
let re = /[^]]/
616+
let re = /[/]/
617+
618+
// Additional leading ']' edge cases
619+
let re = /[]]/
620+
let re = /[\]]/
621+
let re = /[[]]/
622+
let re = /[^]/]/

0 commit comments

Comments
 (0)