Skip to content

[RegexDiff X64] [Copilot] Extract common trailing anchors from regex alterna ... #1839

@MihuBot

Description

@MihuBot

Job completed in 10 minutes 36 seconds (remote runner delay: 0 seconds).
dotnet/runtime#126179
Using arguments: regexdiff
Main commit: dotnet/runtime@638fdf0
PR commit: dotnet/runtime@185abee

109 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"\\d{5}$|\\d{5}-\\d{4}$" (5703 uses)
[GeneratedRegex("\\d{5}$|\\d{5}-\\d{4}$", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
  /// Explanation:<br/>
  /// <code>
  /// ○ Match a Unicode digit exactly 5 times.<br/>
-   /// ○ Match with 2 alternative expressions, atomically.<br/>
-   ///     ○ Match if at the end of the string or if before an ending newline.<br/>
-   ///     ○ Match a sequence of expressions.<br/>
-   ///         ○ Match '-'.<br/>
-   ///         ○ Match a Unicode digit exactly 4 times.<br/>
-   ///         ○ Match if at the end of the string or if before an ending newline.<br/>
+   /// ○ Optional (lazy).<br/>
+   ///     ○ Match '-'.<br/>
+   ///     ○ Match a Unicode digit exactly 4 times.<br/>
+   /// ○ Match if at the end of the string or if before an ending newline.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
              {
                  int pos = base.runtextpos;
                  int matchStart = pos;
+                   int lazyloop_iteration = 0;
+                   int stackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match a Unicode digit exactly 5 times.
                      }
                  }
                  
-                   // Match with 2 alternative expressions, atomically.
+                   // Optional (lazy).
+                   //{
+                       pos += 5;
+                       slice = inputSpan.Slice(pos);
+                       lazyloop_iteration = 0;
+                       goto LazyLoopEnd;
+                       
+                       LazyLoopBody:
+                       Utilities.StackPush(ref base.runstack!, ref stackpos, pos);
+                       lazyloop_iteration++;
+                       
+                       if ((uint)slice.Length < 5 ||
+                           slice[0] != '-' || // Match '-'.
+                           !char.IsDigit(slice[1]) || // Match a Unicode digit exactly 4 times.
+                           !char.IsDigit(slice[2]) ||
+                           !char.IsDigit(slice[3]) ||
+                           !char.IsDigit(slice[4]))
+                       {
+                           goto LazyLoopIterationNoMatch;
+                       }
+                       
+                       pos += 5;
+                       slice = inputSpan.Slice(pos);
+                       goto LazyLoopEnd;
+                       
+                       // The lazy loop iteration failed to match.
+                       LazyLoopIterationNoMatch:
+                       return false; // The input didn't match.
+                       
+                       LazyLoopEnd:
+                       Utilities.StackPush(ref base.runstack!, ref stackpos, pos);
+                       goto LazyLoopSkipBacktrack;
+                       
+                       LazyLoopBacktrack:
+                       if (Utilities.s_hasTimeout)
+                       {
+                           base.CheckTimeout();
+                       }
+                       
+                       pos = base.runstack![--stackpos];
+                       slice = inputSpan.Slice(pos);
+                       
+                       // If the upper bound 1 has already been reached,
+                       // don't continue lazily iterating. Instead, backtrack.
+                       if (lazyloop_iteration != 0)
+                       {
+                           stackpos--;
+                           return false; // The input didn't match.
+                       }
+                       goto LazyLoopBody;
+                       
+                       LazyLoopSkipBacktrack:;
+                   //}
+                   
+                   // Match if at the end of the string or if before an ending newline.
+                   if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
                  {
-                       int alternation_starting_pos = pos;
-                       
-                       // Branch 0
-                       {
-                           // Match if at the end of the string or if before an ending newline.
-                           if (6 < slice.Length || (5 < slice.Length && slice[5] != '\n'))
-                           {
-                               goto AlternationBranch;
-                           }
-                           
-                           pos += 5;
-                           slice = inputSpan.Slice(pos);
-                           goto AlternationMatch;
-                           
-                           AlternationBranch:
-                           pos = alternation_starting_pos;
-                           slice = inputSpan.Slice(pos);
-                       }
-                       
-                       // Branch 1
-                       {
-                           if ((uint)slice.Length < 10 ||
-                               slice[5] != '-' || // Match '-'.
-                               !char.IsDigit(slice[6]) || // Match a Unicode digit exactly 4 times.
-                               !char.IsDigit(slice[7]) ||
-                               !char.IsDigit(slice[8]) ||
-                               !char.IsDigit(slice[9]))
-                           {
-                               return false; // The input didn't match.
-                           }
-                           
-                           // Match if at the end of the string or if before an ending newline.
-                           if (11 < slice.Length || (10 < slice.Length && slice[10] != '\n'))
-                           {
-                               return false; // The input didn't match.
-                           }
-                           
-                           pos += 10;
-                           slice = inputSpan.Slice(pos);
-                       }
-                       
-                       AlternationMatch:;
+                       goto LazyLoopBacktrack;
                  }
                  
                  // The input matched.
          return -1;
      }
      
+       /// <summary>Pushes 1 value onto the backtracking stack.</summary>
+       [MethodImpl(MethodImplOptions.AggressiveInlining)]
+       internal static void StackPush(ref int[] stack, ref int pos, int arg0)
+       {
+           // If there's space available for the value, store it.
+           int[] s = stack;
+           int p = pos;
+           if ((uint)p < (uint)s.Length)
+           {
+               s[p] = arg0;
+               pos++;
+               return;
+           }
+       
+           // Otherwise, resize the stack to make room and try again.
+           WithResize(ref stack, ref pos, arg0);
+       
+           // <summary>Resize the backtracking stack array and push 1 value onto the stack.</summary>
+           [MethodImpl(MethodImplOptions.NoInlining)]
+           static void WithResize(ref int[] stack, ref int pos, int arg0)
+           {
+               Array.Resize(ref stack, (pos + 0) * 2);
+               StackPush(ref stack, ref pos, arg0);
+           }
+       }
+       
      /// <summary>Supports searching for characters in or not in "\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&amp;'()*+,-./:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f".</summary>
      internal static readonly SearchValues<char> s_asciiExceptDigits = SearchValues.Create("\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000e\u000f\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f !\"#$%&'()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f");
  }
"^\\d+$|^\\-?\\d*\\.\\d*$" (499 uses)
[GeneratedRegex("^\\d+$|^\\-?\\d*\\.\\d*$")]
  /// Explanation:<br/>
  /// <code>
  /// ○ Match if at the beginning of the string.<br/>
-   /// ○ Match with 2 alternative expressions, atomically.<br/>
-   ///     ○ Match a sequence of expressions.<br/>
-   ///         ○ Match a Unicode digit atomically at least once.<br/>
-   ///         ○ Match if at the end of the string or if before an ending newline.<br/>
+   /// ○ Match with 2 alternative expressions.<br/>
+   ///     ○ Match a Unicode digit atomically at least once.<br/>
  ///     ○ Match a sequence of expressions.<br/>
  ///         ○ Match '-' atomically, optionally.<br/>
  ///         ○ Match a Unicode digit atomically any number of times.<br/>
  ///         ○ Match '.'.<br/>
  ///         ○ Match a Unicode digit atomically any number of times.<br/>
-   ///         ○ Match if at the end of the string or if before an ending newline.<br/>
+   /// ○ Match if at the end of the string or if before an ending newline.<br/>
  /// </code>
  /// </remarks>
  [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
              {
                  int pos = base.runtextpos;
                  int matchStart = pos;
+                   int alternation_branch = 0;
+                   int alternation_starting_pos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // Match if at the beginning of the string.
                      return false; // The input didn't match.
                  }
                  
-                   // Match with 2 alternative expressions, atomically.
-                   {
-                       int alternation_starting_pos = pos;
+                   // Match with 2 alternative expressions.
+                   //{
+                       alternation_starting_pos = pos;
                      
                      // Branch 0
-                       {
+                       //{
                          // Match a Unicode digit atomically at least once.
                          {
                              int iteration = 0;
                              pos += iteration;
                          }
                          
-                           // Match if at the end of the string or if before an ending newline.
-                           if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
-                           {
-                               goto AlternationBranch;
-                           }
-                           
+                           alternation_branch = 0;
                          goto AlternationMatch;
                          
                          AlternationBranch:
                          pos = alternation_starting_pos;
                          slice = inputSpan.Slice(pos);
-                       }
+                       //}
                      
                      // Branch 1
-                       {
+                       //{
                          // Match '-' atomically, optionally.
                          {
                              if (!slice.IsEmpty && slice[0] == '-')
                              pos += iteration2;
                          }
                          
-                           // Match if at the end of the string or if before an ending newline.
-                           if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
-                           {
+                           alternation_branch = 1;
+                           goto AlternationMatch;
+                       //}
+                       
+                       AlternationBacktrack:
+                       if (Utilities.s_hasTimeout)
+                       {
+                           base.CheckTimeout();
+                       }
+                       
+                       switch (alternation_branch)
+                       {
+                           case 0:
+                               goto AlternationBranch;
+                           case 1:
                              return false; // The input didn't match.
-                           }
-                           
                      }
                      
                      AlternationMatch:;
+                   //}
+                   
+                   // Match if at the end of the string or if before an ending newline.
+                   if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
+                   {
+                       goto AlternationBacktrack;
                  }
                  
                  // The input matched.
"(h|am\\b|a\\.m\\.|a m\\b|a\\. m\\.|a\\.m\\b| ..." (209 uses)
[GeneratedRegex("(h|am\\b|a\\.m\\.|a m\\b|a\\. m\\.|a\\.m\\b|a\\. m\\b)", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
  ///                     ○ Match if at a word boundary.<br/>
  ///                 ○ Match a sequence of expressions.<br/>
  ///                     ○ Match '.'.<br/>
-   ///                     ○ Match with 4 alternative expressions, atomically.<br/>
+   ///                     ○ Match with 3 alternative expressions, atomically.<br/>
  ///                         ○ Match a sequence of expressions.<br/>
  ///                             ○ Match a character in the set [Mm].<br/>
  ///                             ○ Match '.'.<br/>
  ///                             ○ Match a character in the set [Mm].<br/>
  ///                             ○ Match '.'.<br/>
  ///                         ○ Match a sequence of expressions.<br/>
-   ///                             ○ Match a character in the set [Mm].<br/>
-   ///                             ○ Match if at a word boundary.<br/>
-   ///                         ○ Match a sequence of expressions.<br/>
-   ///                             ○ Match ' '.<br/>
-   ///                             ○ Match a character in the set [Mm].<br/>
+   ///                             ○ Match with 2 alternative expressions.<br/>
+   ///                                 ○ Match a character in the set [Mm].<br/>
+   ///                                 ○ Match a sequence of expressions.<br/>
+   ///                                     ○ Match ' '.<br/>
+   ///                                     ○ Match a character in the set [Mm].<br/>
  ///                             ○ Match if at a word boundary.<br/>
  ///                 ○ Match a sequence of expressions.<br/>
  ///                     ○ Match ' '.<br/>
                  int pos = base.runtextpos;
                  int matchStart = pos;
                  int capture_starting_pos = 0;
+                   int stackpos = 0;
                  ReadOnlySpan<char> slice = inputSpan.Slice(pos);
                  
                  // 1st capture group.
                                              
                                          case '.':
                                              
-                                               // Match with 4 alternative expressions, atomically.
+                                               // Atomic group.
                                              {
-                                                   int alternation_starting_pos = pos;
+                                                   int atomic_stackpos = stackpos;
                                                  
-                                                   // Branch 0
-                                                   {
-                                                       if ((uint)slice.Length < 4 ||
-                                                           !slice.Slice(2).StartsWith("m.", StringComparison.OrdinalIgnoreCase)) // Match the string "m." (ordinal case-insensitive)
+                                                   // Match with 3 alternative expressions, atomically.
+                                                   //{
+                                                       int alternation_starting_pos = pos;
+                                                       
+                                                       // Branch 0
                                                      {
-                                                           goto AlternationBranch;
+                                                           if ((uint)slice.Length < 4 ||
+                                                               !slice.Slice(2).StartsWith("m.", StringComparison.OrdinalIgnoreCase)) // Match the string "m." (ordinal case-insensitive)
+                                                           {
+                                                               goto AlternationBranch;
+                                                           }
+                                                           
+                                                           pos += 4;
+                                                           slice = inputSpan.Slice(pos);
+                                                           goto AlternationMatch;
+                                                           
+                                                           AlternationBranch:
+                                                           pos = alternation_starting_pos;
+                                                           slice = inputSpan.Slice(pos);
                                                      }
                                                      
-                                                       pos += 4;
-                                                       slice = inputSpan.Slice(pos);
-                                                       goto AlternationMatch;
+                                                       // Branch 1
+                                                       {
+                                                           if ((uint)slice.Length < 5 ||
+                                                               !slice.Slice(2).StartsWith(" m.", StringComparison.OrdinalIgnoreCase)) // Match the string " m." (ordinal case-insensitive)
+                                                           {
+                                                               goto AlternationBranch1;
+                                                           }
+                                                           
+                                                           pos += 5;
+                                                           slice = inputSpan.Slice(pos);
+                                                           goto AlternationMatch;
+                                                           
+                                                           AlternationBranch1:
+                                                           pos = alternation_starting_pos;
+                                                           slice = inputSpan.Slice(pos);
+                                                       }
                                                      
-                                                       AlternationBranch:
-                                                       pos = alternation_starting_pos;
-                                                       slice = inputSpan.Slice(pos);
-                                                   }
+                                                       // Branch 2
+                                                       {
+                                                           // Match with 2 alternative expressions.
+                                                           //{
+                                                               if ((uint)slice.Length < 3)
+                                                               {
+                                                                   UncaptureUntil(0);
+                                                                   return false; // The input didn't match.
+                                                               }
+                                                               
+                                                               switch (slice[2])
+                                                               {
+                                                                   case 'M' or 'm':
+                                                                       pos += 3;
+                                                                       slice = inputSpan.Slice(pos);
+                                                                       break;
+                                                                       
+                                                                   case ' ':
+                                                                       
+                                                                       // Match a character in the set [Mm].
+                                                                       if ((uint)slice.Length < 4 || ((slice[3] | 0x20) != 'm'))
+                                                                       {
+                                                                           UncaptureUntil(0);
+                                                                           return false; // The input didn't match.
+                                                                       }
+                                                                       
+                                                                       pos += 4;
+                                                                       slice = inputSpan.Slice(pos);
+                                                                       break;
+                                                                       
+                                                                   default:
+                                                                       UncaptureUntil(0);
+                                                                       return false; // The input didn't match.
+                                                               }
+                                                           //}
+                                                           
+                                                           // Match if at a word boundary.
+                                                           if (!Utilities.IsPostWordCharBoundary(inputSpan, pos))
+                                                           {
+                                                               UncaptureUntil(0);
+                                                               return false; // The input didn't match.
+                                                           }
+                                                           
+                                                       }
+                                                       
+                                                       AlternationMatch:;
+                                                   //}
                                                  
-                                                   // Branch 1
-                                                   {
-                                                       if ((uint)slice.Length < 5 ||
-                                                           !slice.Slice(2).StartsWith(" m.", StringComparison.OrdinalIgnoreCase)) // Match the string " m." (ordinal case-insensitive)
-                                                       {
-                                                           goto AlternationBranch1;
-                                                       }
-                                                       
-                                                       pos += 5;
-                                                       slice = inputSpan.Slice(pos);
-                                                       goto AlternationMatch;
-                                                       
-                                                       AlternationBranch1:
-                                                       pos = alternation_starting_pos;
-                                                       slice = inputSpan.Slice(pos);
-                                                   }
-                                                   
-                                                   // Branch 2
-                                                   {
-                                                       // Match a character in the set [Mm].
-                                                       if ((uint)slice.Length < 3 || ((slice[2] | 0x20) != 'm'))
-                                                       {
-                                                           goto AlternationBranch2;
-                                                       }
-                                                       
-                                                       // Match if at a word boundary.
-                                                       if (!Utilities.IsPostWordCharBoundary(inputSpan, pos + 3))
-                                                       {
-                                                           goto AlternationBranch2;
-                                                       }
-                                                       
-                                                       pos += 3;
-                                                       slice = inputSpan.Slice(pos);
-                                                       goto AlternationMatch;
-                                                       
-                                                       AlternationBranch2:
-                                                       pos = alternation_starting_pos;
-                                                       slice = inputSpan.Slice(pos);
-                                                   }
-                                                   
-                                                   // Branch 3
-                                                   {
-                                                       if ((uint)slice.Length < 4 ||
-                                                           !slice.Slice(2).StartsWith(" m", StringComparison.OrdinalIgnoreCase)) // Match the string " m" (ordinal case-insensitive)
-                                                       {
-                                                           UncaptureUntil(0);
-                                                           return false; // The input didn't match.
-                                                       }
-                                                       
-                                                       // Match if at a word boundary.
-                                                       if (!Utilities.IsPostWordCharBoundary(inputSpan, pos + 4))
-                                                       {
-                                                           UncaptureUntil(0);
-                                                           return false; // The input didn't match.
-                                                       }
-                                                       
-                                                       pos += 4;
-                                                       slice = inputSpan.Slice(pos);
-                                                   }
-                                                   
-                                                   AlternationMatch:;
+                                                   stackpos = atomic_stackpos;
                                              }
                                              
                                              break;

For more diff examples, see https://gist.github.com/MihuBot/4d73324684540f4ecef28477dedd383d

Sample source code for further analysis
// Downloads the results archive once (cached on disk at JsonPath), then loads every entry into memory.
const string JsonPath = "RegexResults-1839.json";
if (!File.Exists(JsonPath))
{
    // Declared first so it is disposed last, after the response stream and the archive that reads from it.
    using var httpClient = new HttpClient();
    await using var archiveStream = await httpClient.GetStreamAsync("https://mihubot.xyz/r/FKKoIJCA");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);

    // Throws InvalidOperationException if the archive has no "Results.json" entry.
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

// NOTE(review): IncludeFields is presumably needed for the value-tuple members of RegexEntry,
// whose elements are fields rather than properties - confirm before removing.
using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



/// <summary>Identifies one regex from the results file by its pattern text and options.</summary>
/// <param name="Pattern">The regular expression pattern text.</param>
/// <param name="Options">The <see cref="RegexOptions"/> the pattern is used with.</param>
/// <param name="Count">A count associated with the pattern; presumably the number of known uses (the report lists "N uses" per pattern) - confirm.</param>
record KnownPattern(string Pattern, RegexOptions Options, int Count);

/// <summary>One element of the deserialized results array: a pattern together with the source text and diffs recorded for it.</summary>
sealed class RegexEntry
{
    /// <summary>The pattern/options pair this entry describes. Must be set at construction (<c>required</c>).</summary>
    public required KnownPattern Regex { get; set; }

    /// <summary>Source text recorded for this pattern; presumably the generator output from the main commit - confirm.</summary>
    public required string MainSource { get; set; }

    /// <summary>Source text recorded for this pattern; presumably the generator output from the PR commit - confirm.</summary>
    public required string PrSource { get; set; }

    /// <summary>Optional diff text; presumably the complete diff between the two sources - confirm.</summary>
    public string? FullDiff { get; set; }

    /// <summary>Optional diff text; presumably an abbreviated form of <see cref="FullDiff"/> - confirm.</summary>
    public string? ShortDiff { get; set; }

    /// <summary>Optional (name, values) pairs; presumably the <c>SearchValues&lt;char&gt;</c> fields emitted for the pattern - confirm.</summary>
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }

    /// <summary>Optional (values, comparison type) pairs; presumably the <c>SearchValues&lt;string&gt;</c> fields emitted for the pattern - confirm.</summary>
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions