Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 46 additions & 5 deletions src/framework/mpas_stream_list.F
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,18 @@ end subroutine MPAS_stream_list_remove !}}}
!> will return .false.
!
!-----------------------------------------------------------------------

! Optimized; changes from original:
! 1. Pattern is compiled ONCE before the node loop (not once per node).
! 2. Compilation uses the cross-call cache in regex_matching.c, so each
! unique pattern string is compiled at most once for the entire run.
! 3. regex_free_cached releases nothing (the cache owns the compiled
! patterns); it only resets the handle to -1. It is still called on
! exit for forward-compatibility and readability.
! 4. Interfaces for the three cached-regex routines are declared alongside
! the existing check_regex_match interface, which is kept.

logical function MPAS_stream_list_query(list, streamPattern, stream, ierr) result(found) !{{{

use iso_c_binding, only: c_char, c_int
use iso_c_binding, only: c_char, c_int, c_intptr_t
use mpas_c_interfacing, only : mpas_f_to_c_string, mpas_c_to_f_string

implicit none
Expand All @@ -237,22 +246,47 @@ logical function MPAS_stream_list_query(list, streamPattern, stream, ierr) resul

character(kind=c_char), dimension(StrKIND+1) :: c_pattern
character(kind=c_char), dimension(StrKIND+1) :: c_test_string
integer(kind=c_int) :: c_match
integer(kind=c_int) :: c_match ! 1=match, 0=no match, -1=error
integer(kind=c_int) :: c_compile_err ! 0=ok, -1=error
integer(kind=c_intptr_t) :: c_regex_handle ! opaque cache index

type (MPAS_stream_list_type), pointer :: node

interface
! Legacy function for backward compatibility
subroutine check_regex_match(pattern, string, imatch) bind(c)
use iso_c_binding, only: c_char, c_int
character(kind=c_char), dimension(*), intent(in) :: pattern, string
integer(kind=c_int), intent(out) :: imatch
end subroutine check_regex_match

! Compile (or retrieve from cache) a regex pattern.
! Returns an opaque handle used by regex_exec_cached.
subroutine regex_compile_cached(pattern, handle, ierr_out) bind(c)
use iso_c_binding, only : c_char, c_int, c_intptr_t
character(kind=c_char), dimension(*), intent(in) :: pattern
integer(kind=c_intptr_t), intent(out) :: handle
integer(kind=c_int), intent(out) :: ierr_out
end subroutine regex_compile_cached

! Test `string` against the pre-compiled pattern identified by `handle`.
subroutine regex_exec_cached(handle, string, imatch) bind(c)
use iso_c_binding, only : c_char, c_int, c_intptr_t
integer(kind=c_intptr_t), intent(inout) :: handle
character(kind=c_char), dimension(*), intent(in) :: string
integer(kind=c_int), intent(out) :: imatch
end subroutine regex_exec_cached

! Release the handle (no-op when using the cache; included for
! forward-compatibility and to clearly mark every exit point).
subroutine regex_free_cached(handle) bind(c)
use iso_c_binding, only : c_intptr_t
integer(kind=c_intptr_t), intent(inout) :: handle
end subroutine regex_free_cached
end interface

LIST_DEBUG_WRITE(' -- Called MPAS_stream_list_query()')

call mpas_f_to_c_string(streamPattern, c_pattern)

if (present(ierr)) ierr = MPAS_STREAM_LIST_NOERR
found = .false.

Expand All @@ -263,6 +297,10 @@ end subroutine check_regex_match
return
end if

! Compile the pattern ONCE, before entering the node loop
call mpas_f_to_c_string(streamPattern, c_pattern)
call regex_compile_cached(c_pattern, c_regex_handle, c_compile_err)

if ( associated(stream) ) then
if ( associated(stream % next) ) then
node => stream % next
Expand All @@ -276,7 +314,7 @@ end subroutine check_regex_match
do while (associated(node))
call mpas_f_to_c_string(node % name, c_test_string)

call check_regex_match(c_pattern, c_test_string, c_match)
call regex_exec_cached(c_regex_handle, c_test_string, c_match)

if ( c_match == -1 ) then
call mpas_log_write('Regular expression matching failed.', MPAS_LOG_ERR)
Expand All @@ -287,11 +325,14 @@ end subroutine check_regex_match
if ( c_match == 1 ) then
found = .true.
stream => node
call regex_free_cached(c_regex_handle) ! no-op, but marks exit
return
end if
node => node % next
end do

call regex_free_cached(c_regex_handle) ! no-op, but marks exit

LIST_DEBUG_WRITE(' -- No items matching '//trim(streamPattern)//' found in list.')
nullify(stream)

Expand Down
260 changes: 229 additions & 31 deletions src/framework/regex_matching.c
Original file line number Diff line number Diff line change
@@ -1,37 +1,235 @@
/*
* regex_matching.c
*
* Provides regex pattern matching for MPAS stream list queries.
*
* Optimizations over the original:
* 1. Compile-once-per-query : regex compiled once per call, not per node.
* 2. Cross-call pattern cache: each unique pattern is compiled at most once
* for the entire model run (cache persists across
* timesteps and repeated calls).
* 3. Plain-string fast path : patterns with no regex metacharacters are
* matched with strcmp(), bypassing the regex
* engine entirely.
*
* Public API (called from Fortran via iso_c_binding):
* regex_compile_cached(pattern, handle, ierr_out)
* regex_exec_cached (handle, string, imatch)
* regex_free_cached (handle) -- frees nothing (cache owns lifetime);
*                               only resets handle to -1
*
* Legacy entry point (kept for any other callers):
* check_regex_match(pattern, string, imatch)
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <regex.h>

#define MAX_LEN 1024

void check_regex_match(const char * pattern, const char * str, int *imatch){
regex_t regex;
char bracketed_pattern[MAX_LEN];
int ierr, len;

*imatch = 0;
len = snprintf(bracketed_pattern, 1024, "^%s$", pattern);
if ( len >= MAX_LEN ) {
*imatch = -1;
return;
}

ierr = regcomp(&regex, bracketed_pattern, 0);
if ( ierr ) {
*imatch = -1;
return;
}

ierr = regexec(&regex, str, 0, NULL, 0);

regfree(&regex);

if ( !ierr ) {
*imatch = 1;
} else if ( ierr == REG_NOMATCH ) {
*imatch = 0;
} else {
*imatch = -1;
}
/* ── Tuneable constants ──────────────────────────────────────────────────── */
/* MAX_LEN: size in bytes, terminating NUL included, of every pattern buffer
   in this file.  regex_compile_cached reports an error when the anchored
   form "^<pattern>$" does not fit in a buffer of this size. */
#define MAX_LEN 1024 /* max length of a bracketed pattern string */
#define CACHE_SIZE 64 /* max number of unique patterns held in cache;
MPAS typically uses far fewer than this. */

/* ── Regex metacharacter set ─────────────────────────────────────────────── */
/* Any pattern containing one of these characters is handed to the regex
   engine; a pattern containing none of them is compared with strcmp (see
   is_plain_string).
   NOTE(review): regcomp is called with cflags 0 in this file, i.e. POSIX
   basic syntax, where some of these characters (for example '+', '?', '|')
   are presumably ordinary.  Listing them is conservative: such patterns
   simply skip the fast path.  Confirm before trimming the set. */
static const char *REGEX_META = ".*+?[]{}()\\|^$";

/* ── Cache entry ─────────────────────────────────────────────────────────── */
/* One slot of the pattern cache.  The index of a slot within `cache` is the
   integer handle that regex_compile_cached returns to its caller. */
typedef struct {
char bracketed[MAX_LEN]; /* "^<pattern>$" — used as the cache key */
char plain[MAX_LEN]; /* original pattern, only valid when is_plain */
regex_t regex; /* compiled regex, only valid when !is_plain */
int is_plain; /* 1 → use strcmp fast path, 0 → use regex */
int valid; /* 1 → entry is populated and ready to use; cleared while a
              slot is being filled and set again only on success */
} regex_cache_entry_t;

/* File-scope cache storage.  Static storage duration means every slot starts
   zero-filled, so valid == 0 until a slot is populated. */
static regex_cache_entry_t cache[CACHE_SIZE];
/* Number of leading slots of `cache` that lookups scan and that
   regex_exec_cached accepts as handles. */
static int cache_count = 0;

/* ── Internal helpers ────────────────────────────────────────────────────── */

/*
 * is_plain_string
 *
 * Decides whether `pattern` may be matched with a plain string comparison
 * instead of the regex engine.
 *
 * pattern : NUL-terminated pattern text (must not be NULL)
 * returns : 1 when no character of `pattern` appears in REGEX_META
 *           (an empty pattern therefore counts as plain), 0 otherwise
 */
static int is_plain_string(const char *pattern)
{
    /* strpbrk yields the first character of `pattern` that belongs to the
       metacharacter set, or NULL when there is none. */
    const char *first_meta = strpbrk(pattern, REGEX_META);

    return (first_meta == NULL) ? 1 : 0;
}

/*
* cache_get_or_compile
*
* Looks up `bracketed_pattern` in the cache. On a miss, compiles it and
* stores the result. Returns the cache index on success, or -1 on error.
*
* Eviction policy: when the cache is full the oldest entry (index 0) is
* evicted and the array is shifted left so the newest entry occupies the
* highest slot. In practice MPAS stream patterns are fixed at startup so
* the cache should never fill.
*/
static int cache_get_or_compile(const char *bracketed_pattern,
const char *plain_pattern)
{
int i, idx;

/* 1. Search for a cache hit ------------------------------------------ */
for (i = 0; i < cache_count; i++) {
if (cache[i].valid &&
strncmp(cache[i].bracketed, bracketed_pattern, MAX_LEN) == 0) {
return i; /* hit — no compilation needed */
}
}

/* 2. Cache miss -------------------------------------------------------- */
if (cache_count >= CACHE_SIZE) {
/* Evict the oldest entry to make room */
if (!cache[0].is_plain) {
regfree(&cache[0].regex);
}
memmove(&cache[0], &cache[1],
(CACHE_SIZE - 1) * sizeof(regex_cache_entry_t));
cache_count = CACHE_SIZE - 1;
}

idx = cache_count;

/* Populate key fields */
strncpy(cache[idx].bracketed, bracketed_pattern, MAX_LEN - 1);
cache[idx].bracketed[MAX_LEN - 1] = '\0';
cache[idx].valid = 0;
cache[idx].is_plain = is_plain_string(plain_pattern);

if (cache[idx].is_plain) {
/* Fast path: store the plain pattern for strcmp */
strncpy(cache[idx].plain, plain_pattern, MAX_LEN - 1);
cache[idx].plain[MAX_LEN - 1] = '\0';
} else {
/* Full regex compilation */
if (regcomp(&cache[idx].regex, bracketed_pattern, 0) != 0) {
return -1; /* compile error */
}
}

cache[idx].valid = 1;
cache_count++;
return idx;
}

/* ── Public API ──────────────────────────────────────────────────────────── */

/*
 * regex_compile_cached
 *
 * Anchors `pattern` as "^<pattern>$", then fetches the matching cache slot
 * (compiling the pattern if it is not cached yet) and reports the slot as
 * an integer handle for regex_exec_cached / regex_free_cached.
 *
 * Arguments:
 *   pattern  [in]  : NUL-terminated pattern, without the ^ and $ anchors
 *   handle   [out] : cache slot index on success, -1 on error
 *   ierr_out [out] : 0 on success; -1 when the anchored pattern does not
 *                    fit in MAX_LEN bytes or the cache lookup/compile fails
 */
void regex_compile_cached(const char *pattern,
                          intptr_t *handle,
                          int *ierr_out)
{
    char anchored[MAX_LEN];
    int written;
    int slot = -1;

    /* snprintf returns the length it wanted to write; anything outside
       [0, MAX_LEN) means an encoding error or truncation. */
    written = snprintf(anchored, MAX_LEN, "^%s$", pattern);
    if (written >= 0 && written < MAX_LEN) {
        slot = cache_get_or_compile(anchored, pattern);
    }

    if (slot < 0) {
        *handle   = -1;
        *ierr_out = -1;
    } else {
        *handle   = (intptr_t)slot;
        *ierr_out = 0;
    }
}

/*
* regex_exec_cached
*
* Tests whether `str` matches the pattern identified by `handle`.
*
* Arguments:
* handle [in] : opaque handle from regex_compile_cached
* str [in] : null-terminated C string to test
* imatch [out] : 1 = match, 0 = no match, -1 = error
*/
void regex_exec_cached(intptr_t *handle,
const char *str,
int *imatch)
{
int idx, ierr;

*imatch = 0;
idx = (int)(*handle);

if (idx < 0 || idx >= cache_count || !cache[idx].valid) {
*imatch = -1;
return;
}

if (cache[idx].is_plain) {
/* Fast path: plain-string comparison, no regex overhead */
*imatch = (strcmp(cache[idx].plain, str) == 0) ? 1 : 0;
return;
}

/* Full regex execution */
ierr = regexec(&cache[idx].regex, str, 0, NULL, 0);
if (!ierr) *imatch = 1;
else if (ierr == REG_NOMATCH) *imatch = 0;
else *imatch = -1;
}

/*
 * regex_free_cached
 *
 * Marks the caller's handle as no longer usable by overwriting it with -1.
 * Nothing is released here: compiled patterns stay in the cache.  The
 * routine exists so that call sites already have a release step should the
 * storage strategy change.
 *
 * Arguments:
 *   handle [inout] : handle to invalidate; set to -1 on return
 */
void regex_free_cached(intptr_t *handle)
{
    const intptr_t invalid_handle = -1;

    *handle = invalid_handle;
}

/* ── Legacy entry point ──────────────────────────────────────────────────── */

/*
 * check_regex_match (original single-call interface, kept for existing
 * callers)
 *
 * Matches `str` against `pattern` in one call by chaining
 * regex_compile_cached, regex_exec_cached and regex_free_cached, so the
 * pattern goes through the same cache as the handle-based API.
 *
 * Arguments:
 *   pattern [in]  : NUL-terminated pattern, without ^ and $ anchors
 *   str     [in]  : NUL-terminated string to test
 *   imatch  [out] : 1 = match, 0 = no match, -1 = error
 */
void check_regex_match(const char *pattern,
                       const char *str,
                       int *imatch)
{
    intptr_t slot;
    int compile_status;

    regex_compile_cached(pattern, &slot, &compile_status);

    if (compile_status == 0) {
        regex_exec_cached(&slot, str, imatch);
        regex_free_cached(&slot);
    } else {
        /* Pattern too long or not compilable */
        *imatch = -1;
    }
}