curl --request GET \
--url https://api.example.com/api/evals/compare
{
"avg_context_recall": {
"baseline": 0.5,
"candidate": 0.5,
"delta": 123
},
"avg_factual_correctness": {
"baseline": 0.5,
"candidate": 0.5,
"delta": 123
},
"avg_faithfulness": {
"baseline": 0.5,
"candidate": 0.5,
"delta": 123
},
"baseline": {
"avg_context_recall": 0.5,
"avg_factual_correctness": 0.5,
"avg_faithfulness": 0.5,
"dataset_version": "<string>",
"run_id": "<string>",
"timestamp": "2023-11-07T05:31:56Z",
"total_cases": 1,
"avg_response_relevancy": 0.5,
"commit_sha": "<string>",
"embedding_model": "<string>",
"gate_passed": true,
"llm_model": "<string>",
"reranker_available_pct": 0.5,
"reranker_model": "<string>"
},
"baseline_only_cases": 1,
"candidate": {
"avg_context_recall": 0.5,
"avg_factual_correctness": 0.5,
"avg_faithfulness": 0.5,
"dataset_version": "<string>",
"run_id": "<string>",
"timestamp": "2023-11-07T05:31:56Z",
"total_cases": 1,
"avg_response_relevancy": 0.5,
"commit_sha": "<string>",
"embedding_model": "<string>",
"gate_passed": true,
"llm_model": "<string>",
"reranker_available_pct": 0.5,
"reranker_model": "<string>"
},
"candidate_only_cases": 1,
"case_comparisons": [
{
"status": "matched",
"test_case_id": "<string>",
"baseline_context_recall": 0.5,
"baseline_factual_correctness": 0.5,
"baseline_faithfulness": 0.5,
"baseline_latency_ms": 1,
"baseline_response_relevancy": 0.5,
"candidate_context_recall": 0.5,
"candidate_factual_correctness": 0.5,
"candidate_faithfulness": 0.5,
"candidate_latency_ms": 1,
"candidate_response_relevancy": 0.5,
"delta_context_recall": 123,
"delta_factual_correctness": 123,
"delta_faithfulness": 123,
"delta_latency_ms": 123,
"delta_response_relevancy": 123
}
],
"matched_cases": 1,
"avg_response_relevancy": {
"baseline": 0.5,
"candidate": 0.5,
"delta": 123
}
}
Compare two eval runs and return metric deltas.
curl --request GET \
--url https://api.example.com/api/evals/compare
{
"avg_context_recall": {
"baseline": 0.5,
"candidate": 0.5,
"delta": 123
},
"avg_factual_correctness": {
"baseline": 0.5,
"candidate": 0.5,
"delta": 123
},
"avg_faithfulness": {
"baseline": 0.5,
"candidate": 0.5,
"delta": 123
},
"baseline": {
"avg_context_recall": 0.5,
"avg_factual_correctness": 0.5,
"avg_faithfulness": 0.5,
"dataset_version": "<string>",
"run_id": "<string>",
"timestamp": "2023-11-07T05:31:56Z",
"total_cases": 1,
"avg_response_relevancy": 0.5,
"commit_sha": "<string>",
"embedding_model": "<string>",
"gate_passed": true,
"llm_model": "<string>",
"reranker_available_pct": 0.5,
"reranker_model": "<string>"
},
"baseline_only_cases": 1,
"candidate": {
"avg_context_recall": 0.5,
"avg_factual_correctness": 0.5,
"avg_faithfulness": 0.5,
"dataset_version": "<string>",
"run_id": "<string>",
"timestamp": "2023-11-07T05:31:56Z",
"total_cases": 1,
"avg_response_relevancy": 0.5,
"commit_sha": "<string>",
"embedding_model": "<string>",
"gate_passed": true,
"llm_model": "<string>",
"reranker_available_pct": 0.5,
"reranker_model": "<string>"
},
"candidate_only_cases": 1,
"case_comparisons": [
{
"status": "matched",
"test_case_id": "<string>",
"baseline_context_recall": 0.5,
"baseline_factual_correctness": 0.5,
"baseline_faithfulness": 0.5,
"baseline_latency_ms": 1,
"baseline_response_relevancy": 0.5,
"candidate_context_recall": 0.5,
"candidate_factual_correctness": 0.5,
"candidate_faithfulness": 0.5,
"candidate_latency_ms": 1,
"candidate_response_relevancy": 0.5,
"delta_context_recall": 123,
"delta_factual_correctness": 123,
"delta_faithfulness": 123,
"delta_latency_ms": 123,
"delta_response_relevancy": 123
}
],
"matched_cases": 1,
"avg_response_relevancy": {
"baseline": 0.5,
"candidate": 0.5,
"delta": 123
}
}
Baseline run_id
1
Candidate run_id
1
Successful Response
Response for comparing two eval runs.
Delta between two metric values.
Show child attributes
Delta between two metric values.
Show child attributes
Delta between two metric values.
Show child attributes
Run summary without per-case results (for list endpoint).
Show child attributes
x >= 0
Run summary without per-case results (for list endpoint).
Show child attributes
x >= 0
Show child attributes
x >= 0
Delta between two metric values.
Show child attributes