Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions mellea/backends/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -832,14 +832,7 @@ async def _generate_from_chat_context_standard(
)
# Convert our linearized context into a sequence of chat messages. Template formatters have a standard way of doing this.
messages: list[Message] = self.formatter.to_chat_messages(linearized_context)
# Add the final message.
match action:
case ALoraRequirement():
raise Exception(
"The OpenAI backend does not currently support activated LoRAs."
)
case _:
messages.extend(self.formatter.to_chat_messages([action]))
messages.extend(self.formatter.to_chat_messages([action]))
conversation: list[dict] = []

system_prompt = model_opts.get(ModelOption.SYSTEM_PROMPT, "")
Expand Down
21 changes: 18 additions & 3 deletions test/stdlib/components/intrinsic/test_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,21 @@ def test_context_relevance(backend_4_0):
assert result == "irrelevant"


def _compare_hallucination(result: list[dict], expected: list[dict]):
"""Special function to compare the result and expected output for hallucination detection.

There are slight differences in explanations depending on where the test is run.
"""
for r, e in zip(result, expected, strict=True):
assert r["response_begin"] == e["response_begin"]
assert r["response_end"] == e["response_end"]
assert r["response_text"] == e["response_text"]
assert r["faithfulness"] == e["faithfulness"]

# Specifically don't check the explanation due to mentioned differences.
# assert result["explanation"] == expected["explanation"]


@pytest.mark.qualitative
def test_hallucination_detection(backend):
"""Verify that the hallucination detection intrinsic functions properly."""
Expand All @@ -196,11 +211,11 @@ def test_hallucination_detection(backend):
# First call triggers adapter loading
result = rag.flag_hallucinated_content(assistant_response, docs, context, backend)
_dump_output_json("hallucination_detection.json", result)
assert result == expected
_compare_hallucination(result, expected)

# Second call hits a different code path from the first one
result = rag.flag_hallucinated_content(assistant_response, docs, context, backend)
assert result == expected
_compare_hallucination(result, expected)


@pytest.mark.qualitative
Expand Down Expand Up @@ -303,7 +318,7 @@ def test_hallucination_detection_resolve(backend):
expected = _read_output_json("hallucination_detection.json")

result = rag.flag_hallucinated_content(None, docs, context, backend)
assert result == expected
_compare_hallucination(result, expected)


@pytest.mark.qualitative
Expand Down
Loading