MITLibraries · jazairi · May 4, 2026 · Apr 23, 2026 · Apr 27, 2026 · Apr 28, 2026
diff --git a/.env.test b/.env.test
@@ -2,7 +2,8 @@ EMAIL_FROM=fake@example.com
 EMAIL_URL_HOST=localhost:3000
 JWT_SECRET_KEY=3862fc949629030de4259b88f6e8f7c3702b2fabfc68d00d46fb7f9f70110690b526997ef4d77765ffa010d8aba440286af39947d0c85287174d99be2db14987
 OPENSEARCH_INDEX=all-current
+AWS_ENDPOINT_URL_LAMBDA=http://localhost:9200
 AWS_ACCESS_KEY_ID=test-key
 AWS_SECRET_ACCESS_KEY=test-secret
 AWS_REGION=us-east-1
-TIMDEX_SEMANTIC_BUILDER_FUNCTION_NAME=timdex-semantic-builder-test
+TIMDEX_SEMANTIC_BUILDER_FUNCTION_NAME=timdex-semantic-builder-prod:live
diff --git a/README.md b/README.md
@@ -85,7 +85,8 @@ end
 
 - Run your test(s). You may receive VCR errors as the `opensearch init` cassette does not have the HTTP transaction you are requesting. However, the nested cassette for your test will generate if it does not exist yet and on future runs these errors will not recur.
 - Manually confirm the headers do not have sensitive information. This scrubbing process should work, but it is your responsibility to ensure you are not committing secrets to code repositories. If you aren't sure, ask.
-- You have to remove or comment out AWS credentials from `.env` before re-running your test or the tests will fail (i.e. this process can only generate cassettes, it can not re-run them with AWS credentials as we scrub the AWS bits from the cassette so VCR does not match)
+- You have to remove or comment out AWS credentials from `.env` before re-running your test or the tests will fail (i.e. this process can only generate cassettes, it can not re-run them with AWS credentials as we scrub the AWS bits from the cassette so VCR does not match).
+- You have to set `AWS_ENDPOINT_URL_LAMBDA` to `http://localhost:9200` in `.env.test` before re-running tests. This is necessary because the initial call uses the default value set by the AWS SDK but subsequent calls use the filtered value.
 
 > [!Important]
 > We re-use OpenSearch connections, which is handled by the nesting of cassettes (see above). If you have sporadically failing tests, ensure you are nesting your test specific cassette inside of the `opensearch init` cassette.

diff --git a/app/graphql/types/query_type.rb b/app/graphql/types/query_type.rb
@@ -67,7 +67,7 @@ def record_id(id:, index:)
                                       description: 'How to join multiword queries. Defaults to "OR" which means any ' \
                                                    'of the words much match. Options include: "OR", "AND"'
       argument :query_mode, String, required: false, default_value: 'keyword',
-                                    description: 'Search mode, either "keyword" or "semantic"'
+                                    description: 'Search mode: "keyword" (lexical search), "semantic" (vector search), or "hybrid" (both)'
 
       # applied filters
       argument :access_to_files_filter, [String],

diff --git a/app/models/hybrid_query_builder.rb b/app/models/hybrid_query_builder.rb
@@ -0,0 +1,69 @@
+class HybridQueryBuilder
+  def build(params, fulltext: false)
+    query_text = params[:q].to_s.strip
+
+    lexical_query = LexicalQueryBuilder.new.build(params, fulltext: fulltext)
+
+    # If no query text provided, return lexical query so filters/other constraints are still applied
+    return lexical_query if query_text.blank?
+
+    begin
+      semantic_query = SemanticQueryBuilder.new.build(params, fulltext: fulltext)
+
+      # Both succeeded - combine them with should clause while preserving filters
+      combine_queries(semantic_query, lexical_query)
+    rescue SemanticQueryBuilder::LambdaError => e
+      # Lambda service failure - report to Sentry and gracefully fall back to lexical search
+      Sentry.capture_exception(e, level: 'warning')
+      Rails.logger.warn(
+        "HybridQueryBuilder semantic query failed: #{e.class}: #{e.message}"
+      )
+      lexical_query
+    end
+  end
+
+  private
+
+  # Combines semantic and lexical queries while preserving non-q filters.
+  # The q multi_match stays in the lexical branch to allow semantic-only matches.
+  def combine_queries(semantic_query, lexical_query)
+    # Extract filters (non-q constraints like title/citation/geo) to apply at top level.
+    # Do NOT extract must (which contains the q multi_match) - it stays in lexical branch
+    # so semantic matches can be returned without matching the q query.
+    lexical_bool = lexical_query.is_a?(Hash) && lexical_query[:bool] ? lexical_query[:bool] : {}
+    top_level_filters = lexical_bool[:filter] || []
+
+    # Keep the full lexical query structure (with q multi_match in must) but remove filters
+    # so we don't duplicate them in the final query
+    lexical_search = if lexical_query.is_a?(Hash) && lexical_query[:bool]
+                       {
+                         bool: {
+                           should: lexical_bool[:should] || [],
+                           must: lexical_bool[:must] || []
+                         }.reject { |_, v| v.blank? }
+                       }
+                     else
+                       lexical_query
+                     end
+
+    hybrid_bool = {
+      should: [
+        semantic_query,
+        lexical_search
+      ]
+    }
+
+    # Apply only filters (non-q constraints) at top level so they apply to both branches
+    hybrid_bool[:filter] = top_level_filters if top_level_filters.present?
+
+    # In OpenSearch, when a bool query has no filters, should clauses are required by default.
+    # When filters are added, should clauses become optional. We explicitly require at least
+    # one should clause to match (semantic or lexical) when filters are present, so we don't
+    # return filter-only results that matched neither branch.
+    hybrid_bool[:minimum_should_match] = 1 if top_level_filters.present?
+
+    {
+      bool: hybrid_bool
+    }
+  end
+end
diff --git a/app/models/opensearch.rb b/app/models/opensearch.rb
@@ -57,6 +57,8 @@ def query
     builder = case @query_mode
               when 'semantic'
                 SemanticQueryBuilder.new
+              when 'hybrid'
+                HybridQueryBuilder.new
               else
                 LexicalQueryBuilder.new
               end

diff --git a/app/models/semantic_query_builder.rb b/app/models/semantic_query_builder.rb
@@ -1,4 +1,7 @@
 class SemanticQueryBuilder
+  # Dedicated exception for Lambda invocation failures (not parsing/validation errors)
+  class LambdaError < StandardError; end
+
   def build(params, fulltext: false)
     query_text = params[:q].to_s.strip
 
@@ -13,16 +16,23 @@ def build(params, fulltext: false)
 
   def invoke_semantic_builder(query_text)
     payload = { query: query_text }
+    function_name = ENV.fetch('TIMDEX_SEMANTIC_BUILDER_FUNCTION_NAME')
 
-    response = Timdex::LambdaClient.invoke(
-      function_name: ENV.fetch('TIMDEX_SEMANTIC_BUILDER_FUNCTION_NAME'),
-      invocation_type: 'RequestResponse',
-      payload: payload.to_json
-    )
+    begin
+      response = Timdex::LambdaClient.invoke(
+        function_name: function_name,
+        invocation_type: 'RequestResponse',
+        payload: payload.to_json
+      )
+    rescue StandardError => e
+      # All errors from the Lambda service call are wrapped in LambdaError
+      # so HybridQueryBuilder can catch it and gracefully fall back to lexical search.
+      raise LambdaError, "Lambda invocation error: #{e.message}", e.backtrace
+    end
 
+    # Response parsing below is outside the rescue block, so JSON/validation errors
+    # propagate as-is and fail fast rather than triggering graceful fallback.
     parse_lambda_payload(response.payload)
-  rescue StandardError => e
-    raise "Semantic query builder Lambda error: #{e.message}"
   end
 
   def parse_lambda_payload(payload)
@@ -39,12 +49,26 @@ def parse_lambda_payload(payload)
 
   def parse_lambda_response(lambda_response)
     # Lambda returns: { "query": { "bool": { "should": [...] } } }
-    # We extract and return just the inner query object
+    # We extract and return just the inner query object with keys normalized to symbols
     raise "Invalid semantic query builder response: missing 'query' key" unless lambda_response.key?('query')
 
     query = lambda_response['query']
     raise 'Invalid semantic query builder response: query must be a Hash' unless query.is_a?(Hash)
 
-    query
+    # Normalize string keys to symbols for consistency with LexicalQueryBuilder
+    normalize_keys(query)
+  end
+
+  # Recursively converts all string keys to symbols in hashes and nested structures.
+  def normalize_keys(value)
+    case value
+    when Hash
+      value.transform_keys { |k| k.is_a?(String) ? k.to_sym : k }
+           .transform_values { |v| normalize_keys(v) }
+    when Array
+      value.map { |item| normalize_keys(item) }
+    else
+      value
+    end
   end
 end
diff --git a/config/initializers/lambda.rb b/config/initializers/lambda.rb
@@ -1,20 +1,17 @@
 require 'aws-sdk-lambda'
 
 def configure_lambda_client
-  if ENV['AWS_SESSION_TOKEN'].present?
-    Aws::Lambda::Client.new(
-      region: ENV.fetch('AWS_REGION', 'us-east-1'),
-      access_key_id: ENV.fetch('AWS_ACCESS_KEY_ID'),
-      secret_access_key: ENV.fetch('AWS_SECRET_ACCESS_KEY'),
-      session_token: ENV.fetch('AWS_SESSION_TOKEN')
-    )
-  else
-    Aws::Lambda::Client.new(
-      region: ENV.fetch('AWS_REGION', 'us-east-1'),
-      access_key_id: ENV.fetch('AWS_ACCESS_KEY_ID'),
-      secret_access_key: ENV.fetch('AWS_SECRET_ACCESS_KEY')
-    )
-  end
+  options = {
+    region: ENV.fetch('AWS_REGION', 'us-east-1'),
+    access_key_id: ENV.fetch('AWS_ACCESS_KEY_ID'),
+    secret_access_key: ENV.fetch('AWS_SECRET_ACCESS_KEY')
+  }
+  options[:session_token] = ENV['AWS_SESSION_TOKEN'] if ENV['AWS_SESSION_TOKEN'].present?
+
+  # AWS SDK sets this env in prod. However, we need to conditionally set it for tests so VCR can
+  # intercept the requests with a fake URL.
+  options[:endpoint] = ENV['AWS_ENDPOINT_URL_LAMBDA'] if ENV['AWS_ENDPOINT_URL_LAMBDA'].present?
+  Aws::Lambda::Client.new(options)
 end
 
 Timdex::LambdaClient = configure_lambda_client
diff --git a/test/controllers/graphql_controller_test.rb b/test/controllers/graphql_controller_test.rb
@@ -956,4 +956,110 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest
       end
     end
   end
+
+  test 'graphql search with queryMode keyword uses lexical builder' do
+    VCR.use_cassette('opensearch init') do
+      VCR.use_cassette('graphql search data analytics keyword') do
+        post '/graphql', params: { query: '{
+                                    search(searchterm: "data analytics", queryMode: "keyword") {
+                                      records {
+                                        title
+                                        score
+                                      }
+                                    }
+                                  }' }
+        assert_equal(200, response.status)
+        json = JSON.parse(response.body)
+
+        # Verify results are present with no errors
+        assert_nil(json['errors'])
+        assert(json['data']['search']['records'].any?)
+      end
+    end
+  end
+
+  test 'graphql search with queryMode semantic uses semantic builder' do
+    VCR.use_cassette('opensearch init') do
+      VCR.use_cassette('graphql search data analytics semantic') do
+        post '/graphql', params: { query: '{
+                                    search(searchterm: "data analytics", queryMode: "semantic") {
+                                      records {
+                                        title
+                                      }
+                                    }
+                                  }' }
+        assert_equal(200, response.status)
+        json = JSON.parse(response.body)
+
+        # Verify results are present with no errors
+        assert_nil(json['errors'])
+        assert(json['data']['search']['records'].any?, 'Expected search results')
+      end
+    end
+  end
+
+  test 'graphql search with queryMode hybrid combines semantic and lexical results' do
+    VCR.use_cassette('opensearch init') do
+      VCR.use_cassette('graphql search data analytics hybrid') do
+        post '/graphql', params: { query: '{
+                                    search(searchterm: "data analytics", queryMode: "hybrid") {
+                                      records {
+                                        title
+                                        score
+                                      }
+                                    }
+                                  }' }
+        assert_equal(200, response.status)
+        json = JSON.parse(response.body)
+
+        # Verify results are present with no errors
+        assert_nil(json['errors'])
+        assert(json['data']['search']['records'].any?)
+      end
+    end
+  end
+
+  test 'graphql search with filter only (no searchterm) returns filtered results' do
+    VCR.use_cassette('opensearch init') do
+      VCR.use_cassette('graphql search title') do
+        post '/graphql', params: { query: '{
+                                    search(title: "Spice") {
+                                      records {
+                                        title
+                                      }
+                                    }
+                                  }' }
+        assert_equal(200, response.status)
+        json = JSON.parse(response.body)
+
+        # Verify results are present with no errors
+        assert_nil(json['errors'])
+        assert(json['data']['search']['records'].any?)
+
+        # Verify results match the filter
+        assert(json['data']['search']['records'].any? { |r| r['title'].include?('Spice') })
+      end
+    end
+  end
+
+  test 'graphql search defaults to lexical when queryMode not provided' do
+    VCR.use_cassette('opensearch init') do
+      VCR.use_cassette('graphql search data analytics') do
+        post '/graphql', params: { query: '{
+                                    search(searchterm: "data analytics") {
+                                      records {
+                                        title
+                                        score
+                                      }
+                                    }
+                                  }' }
+        assert_equal(200, response.status)
+        json = JSON.parse(response.body)
+
+        # Verify results are present with no errors
+        assert_nil(json['errors'])
+        assert(json['data']['search']['records'].any?)
+      end
+    end
+  end
 end