@@ -580,9 +580,53 @@ let scan_regex scanner =
580
580
bring_buf_up_to_date ~start_offset: last_char_offset;
581
581
Buffer. contents buf)
582
582
in
583
- let rec scan () =
583
(* [has_valid_class_closer_ahead ~from_offset] looks ahead in [scanner.src],
   starting just after the '[' at absolute offset [from_offset], and reports
   whether that '[' is closed by an unescaped ']' on the same line.

   The lookahead mirrors the beginning-of-class (BOS) rules applied by the
   main regex scan loop, so both agree on where a class ends:
   - a '^' seen while still at BOS is skipped and leaves BOS set;
   - a ']' seen while still at BOS is literal content and clears BOS;
   - an escaped pair (backslash plus the following character) is skipped as a
     unit and counts as content, so it clears BOS too. Without this, a class
     such as [\d] would need a second ']' later on the line to be recognised,
     while the scan loop already leaves the class at the first ']';
   - any other character is content and clears BOS.

   Returns [true] at the first ']' met once BOS is cleared. Returns [false]
   on '\n', '\r', end of input, or a trailing backslash with nothing after it.

   NOTE(review): every '^' met before the first content character is skipped,
   not only the first one. The scan loop does the same, so this is kept as is
   to stay in step with it; confirm whether a second caret should instead
   count as content. *)
let has_valid_class_closer_ahead ~from_offset =
  let src = scanner.src in
  let len = String.length src in
  (* [pos] is the absolute offset under inspection; [at_bos] is true while no
     content character of the class has been seen yet. State is threaded
     through the arguments, so no mutable cells are needed and every
     recursive call is a tail call. *)
  let rec closes_from ~at_bos pos =
    if pos >= len then false
    else
      (* [pos < len] was checked just above; the unchecked read is in bounds
         as long as the caller passes a non-negative [from_offset]. *)
      match String.unsafe_get src pos with
      | '\n' | '\r' -> false
      | '\\' ->
        (* Jump over the escaped character as well, so that an escaped ']'
           is never taken for a closer. *)
        if pos + 1 < len then closes_from ~at_bos:false (pos + 2) else false
      | '^' when at_bos -> closes_from ~at_bos:true (pos + 1)
      | ']' when at_bos -> closes_from ~at_bos:false (pos + 1)
      | ']' -> true
      | _ -> closes_from ~at_bos:false (pos + 1)
  in
  closes_from ~at_bos:true (from_offset + 1)
in
623
+
624
+ (* Scan until closing '/' that is not inside a character class. Only enter
625
+ character-class mode when a valid ']' is present ahead (same line).
626
+ Track beginning-of-class to allow a leading ']' (or leading '^' then ']'). *)
627
+ let rec scan ~in_class ~class_at_bos =
584
628
match scanner.ch with
585
- | '/' ->
629
+ | '/' when not in_class ->
586
630
let last_char_offset = scanner.offset in
587
631
next scanner;
588
632
let pattern = result ~first_char_offset ~last_char_offset in
@@ -606,12 +650,34 @@ let scan_regex scanner =
606
650
| '\\' ->
607
651
next scanner;
608
652
next scanner;
609
- scan ()
653
+ (* Escapes count as content when inside a class; clear BOS. *)
654
+ scan ~in_class ~class_at_bos: (if in_class then false else class_at_bos)
655
+ | '[' when not in_class ->
656
+ (* Only enter a character class if a closing ']' exists ahead on the
657
+ same line. Otherwise treat '[' as a normal char. *)
658
+ if has_valid_class_closer_ahead ~from_offset: scanner.offset then (
659
+ next scanner;
660
+ scan ~in_class: true ~class_at_bos: true )
661
+ else (
662
+ next scanner;
663
+ scan ~in_class ~class_at_bos )
664
+ | '^' when in_class && class_at_bos ->
665
+ (* Leading caret does not count as content. *)
666
+ next scanner;
667
+ scan ~in_class ~class_at_bos: true
668
+ | ']' when in_class && class_at_bos ->
669
+ (* First ']' after '[' or '[^' is literal, not a closer. *)
670
+ next scanner;
671
+ scan ~in_class ~class_at_bos: false
672
+ | ']' when in_class ->
673
+ (* Leave character class. *)
674
+ next scanner;
675
+ scan ~in_class: false ~class_at_bos: false
610
676
| _ ->
611
677
next scanner;
612
- scan ( )
678
+ scan ~in_class ~class_at_bos: ( if in_class then false else class_at_bos )
613
679
in
614
- let pattern, flags = scan () in
680
+ let pattern, flags = scan ~in_class: false ~class_at_bos: false in
615
681
let end_pos = position scanner in
616
682
(start_pos, end_pos, Token. Regex (pattern, flags))
617
683
0 commit comments