{
    "created": 1769097935.926987,
    "progress": 1.0,
    "progress_message": "DONE",
    "status": 0,
    "error": "",
    "interpretation_key": "25269fb8-8cb5-4879-be2f-3570da92adc1",
    "interpretation_parameters": {
        "model": null,
        "models": "<class 'dict_values'>",
        "dataset": "<class 'h2o_sonar.lib.api.datasets._datasets_genai.LlmDataset'>",
        "validset": null,
        "testset": null,
        "use_raw_features": true,
        "target_col": "",
        "weight_col": "",
        "prediction_col": "",
        "drop_cols": [],
        "sample_num_rows": 0,
        "results_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4",
        "used_features": null
    },
    "explainers": [
        {
            "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator": {}
        },
        {
            "h2o_sonar.evaluators.answer_semantic_similarity_per_sentence_evaluator.AnswerSemanticSimilarityPerSentenceEvaluator": {}
        },
        {
            "h2o_sonar.evaluators.rouge_evaluator.RougeEvaluator": {}
        },
        {
            "h2o_sonar.evaluators.rag_groundedness_evaluator.RagGroundednessEvaluator": {}
        },
        {
            "h2o_sonar.evaluators.rag_tokens_presence_evaluator.RagStrStrEvaluator": {}
        },
        {
            "h2o_sonar.evaluators.rag_chunk_relevancy_evaluator.ContextChunkRelevancyEvaluator": {}
        },
        {
            "h2o_sonar.evaluators.rag_context_mean_reciprocal_rank_evaluator.MeanReciprocalRankEvaluator": {}
        }
    ],
    "result": {
        "dataset": {
            "data": "<class 'datatable.Frame'>",
            "metadata": {
                "shape": "(50, 14)",
                "row_count": 50,
                "column_names": [
                    "key",
                    "input",
                    "corpus",
                    "context",
                    "categories",
                    "relationships",
                    "expected_output",
                    "output_constraints",
                    "output_condition",
                    "actual_output",
                    "actual_duration",
                    "cost",
                    "model_key",
                    "test_key"
                ],
                "column_types": [
                    "str",
                    "str",
                    "str",
                    "str",
                    "str",
                    "str",
                    "str",
                    "str",
                    "str",
                    "str",
                    "real",
                    "real",
                    "str",
                    "str"
                ],
                "column_uniques": [
                    50,
                    49,
                    1,
                    50,
                    1,
                    1,
                    50,
                    47,
                    47,
                    50,
                    50,
                    50,
                    1,
                    1
                ],
                "columns_cat": [],
                "columns_num": [],
                "file_path": "",
                "file_name": "",
                "file_size": 0,
                "missing_values": [
                    "",
                    "?",
                    "None",
                    "nan",
                    "NA",
                    "N/A",
                    "unknown",
                    "inf",
                    "-inf",
                    "1.7976931348623157e+308",
                    "-1.7976931348623157e+308"
                ],
                "columns_meta": [
                    {
                        "name": "key",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 50,
                        "frequency": 0,
                        "unique": 50,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "input",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 49,
                        "frequency": 0,
                        "unique": 49,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "corpus",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 1,
                        "frequency": 0,
                        "unique": 1,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "context",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 50,
                        "frequency": 0,
                        "unique": 50,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "categories",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 1,
                        "frequency": 0,
                        "unique": 1,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "relationships",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 1,
                        "frequency": 0,
                        "unique": 1,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "expected_output",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 50,
                        "frequency": 0,
                        "unique": 50,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "output_constraints",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 47,
                        "frequency": 0,
                        "unique": 47,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "output_condition",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 47,
                        "frequency": 0,
                        "unique": 47,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "actual_output",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 50,
                        "frequency": 0,
                        "unique": 50,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "actual_duration",
                        "data_type": "real",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": true,
                        "is_categorical": false,
                        "count": 50,
                        "frequency": 0,
                        "unique": 50,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "cost",
                        "data_type": "real",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": true,
                        "is_categorical": false,
                        "count": 50,
                        "frequency": 0,
                        "unique": 50,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "model_key",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 1,
                        "frequency": 0,
                        "unique": 1,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    },
                    {
                        "name": "test_key",
                        "data_type": "str",
                        "logical_types": [],
                        "format": "",
                        "is_id": false,
                        "is_numeric": false,
                        "is_categorical": true,
                        "count": 1,
                        "frequency": 0,
                        "unique": 1,
                        "max": null,
                        "min": null,
                        "mean": null,
                        "std": null,
                        "histogram_counts": [],
                        "histogram_ticks": []
                    }
                ],
                "original_dataset_sampled": false,
                "original_dataset_path": "",
                "original_dataset_size": 0,
                "original_dataset_shape": [
                    50,
                    14
                ]
            }
        },
        "testset": {},
        "validset": {},
        "model": {},
        "models": [
            {
                "connection": "743fc0d5-60b1-4e9e-9548-7f3921c0d4fe",
                "model_type": "h2ogpte",
                "name": "RAG model - LLM: claude-3-7-sonnet-20250219, corpus: ['sr1107a1.pdf']",
                "collection_id": "d3020997-052d-43fb-9405-74a60781d6a6",
                "collection_name": "Ephemeral H2O Sonar RAG collection (docs: ['sr1107a1.pdf'])",
                "llm_model_name": "claude-3-7-sonnet-20250219",
                "documents": [
                    "https://www.federalreserve.gov/supervisionreg/srletters/sr1107a1.pdf"
                ],
                "model_cfg": {
                    "embedding_model": "BAAI/bge-large-en-v1.5"
                },
                "key": "7aaf642f-a2d8-4ceb-a873-05731d1d3b42",
                "llm_model_meta": {
                    "h2ogpte_perf_stats": {
                        "llm_name": "claude-3-7-sonnet-20250219",
                        "call_count": 188,
                        "input_tokens": 769589,
                        "output_tokens": 32847,
                        "tokens_per_second": 22.759,
                        "time_to_first_token": 1.0928215,
                        "vision_model_name": "claude-3-7-sonnet-20250219"
                    },
                    "success_count": 50,
                    "retry_count": 0,
                    "timeout_count": 0,
                    "failure_count": 0,
                    "duration_stats": {
                        "max": 19.101261377334595,
                        "min": 5.742784261703491,
                        "n": 50,
                        "sum": 538.1754705905914
                    }
                }
            }
        ],
        "all_explainer_ids": [
            "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
            "h2o_sonar.evaluators.answer_semantic_similarity_per_sentence_evaluator.AnswerSemanticSimilarityPerSentenceEvaluator",
            "h2o_sonar.evaluators.rouge_evaluator.RougeEvaluator",
            "h2o_sonar.evaluators.rag_groundedness_evaluator.RagGroundednessEvaluator",
            "h2o_sonar.evaluators.rag_tokens_presence_evaluator.RagStrStrEvaluator",
            "h2o_sonar.evaluators.rag_chunk_relevancy_evaluator.ContextChunkRelevancyEvaluator",
            "h2o_sonar.evaluators.rag_context_mean_reciprocal_rank_evaluator.MeanReciprocalRankEvaluator"
        ],
        "incompatible_explainer_ids": [],
        "incompatible_explainers": {},
        "scheduled_explainers": [
            "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
            "h2o_sonar.evaluators.answer_semantic_similarity_per_sentence_evaluator.AnswerSemanticSimilarityPerSentenceEvaluator",
            "h2o_sonar.evaluators.rouge_evaluator.RougeEvaluator",
            "h2o_sonar.evaluators.rag_groundedness_evaluator.RagGroundednessEvaluator",
            "h2o_sonar.evaluators.rag_tokens_presence_evaluator.RagStrStrEvaluator",
            "h2o_sonar.evaluators.rag_chunk_relevancy_evaluator.ContextChunkRelevancyEvaluator",
            "h2o_sonar.evaluators.rag_context_mean_reciprocal_rank_evaluator.MeanReciprocalRankEvaluator"
        ],
        "explainers_parameters": {
            "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator": {
                "metric_threshold": 0.75,
                "save_llm_result": true,
                "sentence_level_metrics": true,
                "min_test_cases": 0,
                "short_string_metric": "normalized_edit_distance",
                "short_string_threshold": 10
            },
            "h2o_sonar.evaluators.answer_semantic_similarity_per_sentence_evaluator.AnswerSemanticSimilarityPerSentenceEvaluator": {
                "metric_threshold": 0.75,
                "save_llm_result": true,
                "sentence_level_metrics": true
            },
            "h2o_sonar.evaluators.rouge_evaluator.RougeEvaluator": {
                "metric_threshold": 0.75,
                "save_llm_result": true,
                "min_test_cases": 0
            },
            "h2o_sonar.evaluators.rag_groundedness_evaluator.RagGroundednessEvaluator": {
                "metric_threshold": 0.75,
                "save_llm_result": true,
                "sentence_level_metrics": true,
                "min_test_cases": 0
            },
            "h2o_sonar.evaluators.rag_tokens_presence_evaluator.RagStrStrEvaluator": {
                "metric_threshold": 0.5,
                "save_llm_result": true,
                "evaluate_retrieved_context": false
            },
            "h2o_sonar.evaluators.rag_chunk_relevancy_evaluator.ContextChunkRelevancyEvaluator": {
                "metric_threshold": 0.75,
                "save_llm_result": true
            },
            "h2o_sonar.evaluators.rag_context_mean_reciprocal_rank_evaluator.MeanReciprocalRankEvaluator": {
                "mrr_relevant_chunk_threshold": 0.7,
                "mrr_relevant_chunk_oor_idx": 10,
                "metric_threshold": 0.75,
                "save_llm_result": true
            }
        },
        "executed_explainers": [
            {
                "key": "6deba14a-31cf-48df-aa08-df28e5c3cc30",
                "progress": 1.0,
                "status": 0,
                "error": "",
                "message": "DONE",
                "explainer": {
                    "id": "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
                    "name": "AnswerAccuracyEvaluator",
                    "display_name": "Answer accuracy (semantic similarity)",
                    "tagline": "Evaluate actual answers by comparing them to expected answers using semantic similarity.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n|   | \u2713 |   | \u2713 |   |\n\n**Description**:\n\nAnswer Accuracy Evaluator assesses how closely the\nactual answer matches the expected answer. It measures semantic similarity between\nthe expected answer and actual answer sentences - as the actual answer generated by\nthe RAG/LLM model **should match** the expected answer.\n\n**Method**:\n\n- The answer accuracy metric is calculated as:\n\n```math\nanswer_acc = min( { max( {S(emb(a), emb(e)): for all e in E} ): for all a in A } )\n```\n\n- Where:\n    - `A` is the actual answer.\n    - `emb(a)` is a vector embedding of the actual answer sentence.\n    - `E` is the expected answer.\n    - `emb(e)` is a vector embedding of the expected answer sentence.\n    - `S(a, e)` is the 1 - cosine distance between the actual answer sentence `a`\n      and the expected answer sentence `e`.\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where BGE\n  stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n- For short answers (either expected or actual \u2264 `short_string_threshold` characters),\n  embedding-based similarity is not ideal as it cannot be calculated. 
Instead,\n  the evaluator uses a fallback metric specified by `short_string_metric`:\n    - **normalized_edit_distance** (default): Normalized Levenshtein distance,\n      good for handling typos and case differences.\n    - **exact_match**: Strict case-insensitive matching, ideal for Yes/No answers.\n    - **token_jaccard**: Token overlap similarity, suitable for short multi-word\n      phrases.\n    - **embeddings**: Force embeddings anyway which may and probably will result\n      in `NaN`s metric scores.\n- This ensures accurate evaluation when either answer is short, like \"Yes\", \"No\",\n  \"$25\", \"90 days\", etc., which would otherwise be filtered out during sentence\n  tokenization.\n\n\n**Metrics** calculated by the evaluator:\n\n- **Answer Accuracy** (float)\n    - Answer Accuracy metric determines how closely the actual answer matches the expected answer by **comparing** the actual answer sentences to the expected answer sentences using semantic similarity.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n- The least accurate actual answer sentence (in case that the output metric score is below the threshold).\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are 
considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `sentence_level_metrics` (bool):\n    - Controls whether sentence level metrics are generated.\n    - Default value: `True`\n- `min_test_cases` (int):\n    - Minimum number of test cases, which produces useful results.\n    - Default value: `\"\"`\n- `short_string_metric` (str):\n    - Metric to use for short strings (length <= short_string_threshold). Options: 'normalized_edit_distance' (default, good for handling typos and case differences), 'exact_match' (strict matching, good for Yes/No answers), 'token_jaccard' (token overlap, good for short multi-word phrases), 'embeddings' (force embeddings anyway, may result in NaN for very short strings)\n    - Default value: `normalized_edit_distance`\n- `short_string_threshold` (int):\n    - Character length threshold below which to use short string metric instead of embedding-based similarity. When either the expected or actual answer is at or below this threshold, the short_string_metric is used.\n    - Default value: `10`\n",
                    "brief_description": "Answer Accuracy Evaluator assesses how closely the\nactual answer matches the expected answer. It measures semantic similarity between\nthe expected answer and actual answer sentences - as the actual answer generated by\nthe RAG/LLM model **should match** the expected answer.",
                    "model_types": [
                        "llm",
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "LlmEvalResultsExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LlmHeatmapLeaderboardExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "WorkDirArchiveExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_llm",
                        "evaluates_rag",
                        "requires_expected_answer",
                        "requires_actual_answer",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic",
                        "capability-answer-highlight"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "sentence_level_metrics",
                            "description": "Controls whether sentence level metrics are generated.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "min_test_cases",
                            "description": "Minimum number of test cases, which produces useful results.",
                            "comment": "",
                            "type": "int",
                            "val": 0,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "short_string_metric",
                            "description": "Metric to use for short strings (length <= short_string_threshold). Options: 'normalized_edit_distance' (default, good for handling typos and case differences), 'exact_match' (strict matching, good for Yes/No answers), 'token_jaccard' (token overlap, good for short multi-word phrases), 'embeddings' (force embeddings anyway, may result in NaN for very short strings)",
                            "comment": "",
                            "type": "str",
                            "val": "normalized_edit_distance",
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "short_string_threshold",
                            "description": "Character length threshold below which to use short string metric instead of embedding-based similarity. When either the expected or actual answer is at or below this threshold, the short_string_metric is used.",
                            "comment": "",
                            "type": "int",
                            "val": 10,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "answer_accuracy",
                            "display_name": "Answer Accuracy",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Answer Accuracy metric determines how closely the actual answer matches the expected answer by **comparing** the actual answer sentences to the expected answer sentences using semantic similarity.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                },
                "created": 1769097935.9617805,
                "duration": 5.167494535446167,
                "child_explainer_job_keys": null,
                "job_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4/h2o-sonar/mli_experiment_25269fb8-8cb5-4879-be2f-3570da92adc1/explainer_h2o_sonar_evaluators_answer_accuracy_evaluator_AnswerAccuracyEvaluator_6deba14a-31cf-48df-aa08-df28e5c3cc30",
                "result_descriptor": {
                    "id": "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
                    "name": "AnswerAccuracyEvaluator",
                    "display_name": "Answer accuracy (semantic similarity)",
                    "tagline": "Evaluate actual answers by comparing them to expected answers using semantic similarity.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n|   | \u2713 |   | \u2713 |   |\n\n**Description**:\n\nAnswer Accuracy Evaluator assesses how closely the\nactual answer matches the expected answer. It measures semantic similarity between\nthe expected answer and actual answer sentences - as the actual answer generated by\nthe RAG/LLM model **should match** the expected answer.\n\n**Method**:\n\n- The answer accuracy metric is calculated as:\n\n```math\nanswer_acc = min( { max( {S(emb(a), emb(e)): for all e in E} ): for all a in A } )\n```\n\n- Where:\n    - `A` is the actual answer.\n    - `emb(a)` is a vector embedding of the actual answer sentence.\n    - `E` is the expected answer.\n    - `emb(e)` is a vector embedding of the expected answer sentence.\n    - `S(a, e)` is the 1 - cosine distance between the actual answer sentence `a`\n      and the expected answer sentence `e`.\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where BGE\n  stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n- For short answers (either expected or actual \u2264 `short_string_threshold` characters),\n  embedding-based similarity is not ideal as it cannot be calculated. 
Instead,\n  the evaluator uses a fallback metric specified by `short_string_metric`:\n    - **normalized_edit_distance** (default): Normalized Levenshtein distance,\n      good for handling typos and case differences.\n    - **exact_match**: Strict case-insensitive matching, ideal for Yes/No answers.\n    - **token_jaccard**: Token overlap similarity, suitable for short multi-word\n      phrases.\n    - **embeddings**: Force embeddings anyway which may and probably will result\n      in `NaN`s metric scores.\n- This ensures accurate evaluation when either answer is short, like \"Yes\", \"No\",\n  \"$25\", \"90 days\", etc., which would otherwise be filtered out during sentence\n  tokenization.\n\n\n**Metrics** calculated by the evaluator:\n\n- **Answer Accuracy** (float)\n    - Answer Accuracy metric determines how closely the actual answer matches the expected answer by **comparing** the actual answer sentences to the expected answer sentences using semantic similarity.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n- The least accurate actual answer sentence (in case that the output metric score is below the threshold).\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are 
considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `sentence_level_metrics` (bool):\n    - Controls whether sentence level metrics are generated.\n    - Default value: `True`\n- `min_test_cases` (int):\n    - Minimum number of test cases, which produces useful results.\n    - Default value: `\"\"`\n- `short_string_metric` (str):\n    - Metric to use for short strings (length <= short_string_threshold). Options: 'normalized_edit_distance' (default, good for handling typos and case differences), 'exact_match' (strict matching, good for Yes/No answers), 'token_jaccard' (token overlap, good for short multi-word phrases), 'embeddings' (force embeddings anyway, may result in NaN for very short strings)\n    - Default value: `normalized_edit_distance`\n- `short_string_threshold` (int):\n    - Character length threshold below which to use short string metric instead of embedding-based similarity. When either the expected or actual answer is at or below this threshold, the short_string_metric is used.\n    - Default value: `10`\n",
                    "brief_description": "Answer Accuracy Evaluator assesses how closely the\nactual answer matches the expected answer. It measures semantic similarity between\nthe expected answer and actual answer sentences - as the actual answer generated by\nthe RAG/LLM model **should match** the expected answer.",
                    "model_types": [
                        "llm",
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "Evaluation metrics data",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/csv",
                                "application/vnd.h2oai.datatable.jay"
                            ]
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LLM heatmap leaderboard",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/markdown",
                                "application/vnd.h2oai-evalstudio-leaderboard.markdown"
                            ]
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "Archive of Answer accuracy (semantic similarity) artifacts",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/zip"
                            ]
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_llm",
                        "evaluates_rag",
                        "requires_expected_answer",
                        "requires_actual_answer",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic",
                        "capability-answer-highlight"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "sentence_level_metrics",
                            "description": "Controls whether sentence level metrics are generated.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "min_test_cases",
                            "description": "Minimum number of test cases, which produces useful results.",
                            "comment": "",
                            "type": "int",
                            "val": 0,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "short_string_metric",
                            "description": "Metric to use for short strings (length <= short_string_threshold). Options: 'normalized_edit_distance' (default, good for handling typos and case differences), 'exact_match' (strict matching, good for Yes/No answers), 'token_jaccard' (token overlap, good for short multi-word phrases), 'embeddings' (force embeddings anyway, may result in NaN for very short strings)",
                            "comment": "",
                            "type": "str",
                            "val": "normalized_edit_distance",
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "short_string_threshold",
                            "description": "Character length threshold below which to use short string metric instead of embedding-based similarity. When either the expected or actual answer is at or below this threshold, the short_string_metric is used.",
                            "comment": "",
                            "type": "int",
                            "val": 10,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "answer_accuracy",
                            "display_name": "Answer Accuracy",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Answer Accuracy metric determines how closely the actual answer matches the expected answer by **comparing** the actual answer sentences to the expected answer sentences using semantic similarity.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                }
            },
            {
                "key": "0a05ea29-a1fc-4015-be5f-b9c6a0d2ae9d",
                "progress": 1.0,
                "status": 0,
                "error": "",
                "message": "DONE",
                "explainer": {
                    "id": "h2o_sonar.evaluators.answer_semantic_similarity_per_sentence_evaluator.AnswerSemanticSimilarityPerSentenceEvaluator",
                    "name": "AnswerSemanticSimilarityPerSentenceEvaluator",
                    "display_name": "Answer semantic sentence similarity",
                    "tagline": "AnswerSemanticSimilarityPerSentenceEvaluator.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n|   | \u2713 |   | \u2713 |   |\n\n**Description**:\n\nAnswer Semantic Similarity Evaluator assesses the semantic\nresemblance between the generated answer and the expected answer (ground truth).\n\n\n**Method**:\n\n- The answer similarity per sentence metrics are calculated as:\n\n```math\nanswer similarity = {max({S(emb(a), emb(e)) : for all e in E}): for all a in A}\nmean answer similarity = mean(answer similarity)\nmin answer similarity = min(answer similarity)\n```\n\n- Where:\n    - `A` is the actual answer.\n    - `emb(a)` is a vector embedding of the actual answer sentence.\n    - `E` is the expected answer.\n    - `emb(e)` is a vector embedding of the expected answer sentence.\n    - `S(emb(a), emb(e))` is the 1 - cosine distance between the embedding of the actual\n      answer sentence `a` and the expected answer sentence `e`.\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where\n  BGE stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n\n\n**Metrics** calculated by the evaluator:\n\n- **Mean Answer Similarity** (float)\n    - The mean of the maximum similarity between the actual answer sentences and the expected answer sentences.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n- **Min Answer Similarity** (float)\n    - The minimum similarity between the actual answer sentences and the expected answer sentences.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the 
threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n- The least similar actual answer sentence (in case that the output metric score is below the threshold).\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `sentence_level_metrics` (bool):\n    - Controls whether sentence level metrics are generated.\n    - Default value: `True`\n",
                    "brief_description": "Answer Semantic Similarity Evaluator assesses the semantic\nresemblance between the generated answer and the expected answer (ground truth).\n",
                    "model_types": [
                        "llm",
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "LlmEvalResultsExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LlmHeatmapLeaderboardExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "WorkDirArchiveExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_llm",
                        "evaluates_rag",
                        "requires_expected_answer",
                        "requires_actual_answer",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "sentence_level_metrics",
                            "description": "Controls whether sentence level metrics are generated.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "mean_answer_similarity",
                            "display_name": "Mean Answer Similarity",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "The mean of the maximum similarity between the actual answer sentences and the expected answer sentences.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "min_answer_similarity",
                            "display_name": "Min Answer Similarity",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "The minimum similarity between the actual answer sentences and the expected answer sentences.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                },
                "created": 1769097941.134152,
                "duration": 6.201483249664307,
                "child_explainer_job_keys": null,
                "job_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4/h2o-sonar/mli_experiment_25269fb8-8cb5-4879-be2f-3570da92adc1/explainer_h2o_sonar_evaluators_answer_semantic_similarity_per_sentence_evaluator_AnswerSemanticSimilarityPerSentenceEvaluator_0a05ea29-a1fc-4015-be5f-b9c6a0d2ae9d",
                "result_descriptor": {
                    "id": "h2o_sonar.evaluators.answer_semantic_similarity_per_sentence_evaluator.AnswerSemanticSimilarityPerSentenceEvaluator",
                    "name": "AnswerSemanticSimilarityPerSentenceEvaluator",
                    "display_name": "Answer semantic sentence similarity",
                    "tagline": "AnswerSemanticSimilarityPerSentenceEvaluator.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n|   | \u2713 |   | \u2713 |   |\n\n**Description**:\n\nAnswer Semantic Similarity Evaluator assesses the semantic\nresemblance between the generated answer and the expected answer (ground truth).\n\n\n**Method**:\n\n- The answer similarity per sentence metrics are calculated as:\n\n```math\nanswer similarity = {max({S(emb(a), emb(e)) : for all e in E}): for all a in A}\nmean answer similarity = mean(answer similarity)\nmin answer similarity = min(answer similarity)\n```\n\n- Where:\n    - `A` is the actual answer.\n    - `emb(a)` is a vector embedding of the actual answer sentence.\n    - `E` is the expected answer.\n    - `emb(e)` is a vector embedding of the expected answer sentence.\n    - `S(emb(a), emb(e))` is the 1 - cosine distance between the embedding of the actual\n      answer sentence `a` and the expected answer sentence `e`.\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where\n  BGE stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n\n\n**Metrics** calculated by the evaluator:\n\n- **Mean Answer Similarity** (float)\n    - The mean of the maximum similarity between the actual answer sentences and the expected answer sentences.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n- **Min Answer Similarity** (float)\n    - The minimum similarity between the actual answer sentences and the expected answer sentences.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the 
threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n- The least similar actual answer sentence (in case that the output metric score is below the threshold).\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `sentence_level_metrics` (bool):\n    - Controls whether sentence level metrics are generated.\n    - Default value: `True`\n",
                    "brief_description": "Answer Semantic Similarity Evaluator assesses the semantic\nresemblance between the generated answer and the expected answer (ground truth).\n",
                    "model_types": [
                        "llm",
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "Evaluation metrics data",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/csv",
                                "application/vnd.h2oai.datatable.jay"
                            ]
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LLM heatmap leaderboard",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/markdown",
                                "application/vnd.h2oai-evalstudio-leaderboard.markdown"
                            ]
                        },
                        {
                            "explanation_type": "global-html-fragment",
                            "name": "LLM heatmap leaderboard as HTML",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "text/html"
                            ]
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "Archive of Answer semantic sentence similarity artifacts",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/zip"
                            ]
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_llm",
                        "evaluates_rag",
                        "requires_expected_answer",
                        "requires_actual_answer",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "sentence_level_metrics",
                            "description": "Controls whether sentence level metrics are generated.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "mean_answer_similarity",
                            "display_name": "Mean Answer Similarity",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "The mean of the maximum similarity between the actual answer sentences and the expected answer sentences.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "min_answer_similarity",
                            "display_name": "Min Answer Similarity",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "The minimum similarity between the actual answer sentences and the expected answer sentences.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                }
            },
            {
                "key": "05aca4a2-27a8-407d-a6f7-36153d8ab658",
                "progress": 1.0,
                "status": 0,
                "error": "",
                "message": "DONE",
                "explainer": {
                    "id": "h2o_sonar.evaluators.rouge_evaluator.RougeEvaluator",
                    "name": "RougeEvaluator",
                    "display_name": "ROUGE",
                    "tagline": "Assess the fidelity of generated texts to the reference texts.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n|   | \u2713 |   | \u2713 |   |\n\n**Description**:\n\nROUGE (Recall-Oriented Understudy for Gisting Evaluation)\nis a set of evaluation metrics used to assess the quality of generated summaries\ncompared to reference summaries. There are several variations of ROUGE metrics,\nincluding `ROUGE-1`, `ROUGE-2`, and `ROUGE-L`.\n\n- Compatibility: RAG and LLM models.\n\n**Method**:\n\n- The evaluator reports `F1 score` between the generated (actual answer) and\n  reference (generated answer) n-grams.\n- `ROUGE-1` measures the overlap of 1-grams (individual words) between the generated\n  and the reference summaries.\n- `ROUGE-2` extends the evaluation to 2-grams (pairs of consecutive words).\n- `ROUGE-L` considers the longest common subsequence (LCS) between the generated and\n  reference summaries.\n- These ROUGE metrics provide a quantitative evaluation of the similarity between\n  the generated and reference texts to assess the effectiveness of\n  text summarization algorithms.\n\nSee also:\n\n- 3rd party library ROUGE: https://pypi.org/project/rouge-score/\n- 3rd party ROUGE source code:\n  https://github.com/google-research/google-research/tree/master/rouge\n\n**Metrics** calculated by the evaluator:\n\n- **ROUGE-1** (float)\n    - ROUGE-1 metric measures the overlap of 1-grams (individual words) between the generated and the reference summaries.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n- **ROUGE-2** (float)\n    - ROUGE-1 metric measures the overlap of 2-grams (pairs of consecutive words) between the generated and the reference summaries.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n- **ROUGE-L** (float)\n    - ROUGE-L metric considers the longest common subsequence (LCS) between the 
generated and reference summaries.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `min_test_cases` (int):\n    - Minimum number of test cases, which produces useful results.\n    - Default value: `\"\"`\n",
                    "brief_description": "ROUGE (Recall-Oriented Understudy for Gisting Evaluation)\nis a set of evaluation metrics used to assess the quality of generated summaries\ncompared to reference summaries. There are several variations of ROUGE metrics,\nincluding `ROUGE-1`, `ROUGE-2`, and `ROUGE-L`.\n\n- Compatibility: RAG and LLM models.",
                    "model_types": [
                        "llm",
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "LlmEvalResultsExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LlmHeatmapLeaderboardExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        }
                    ],
                    "keywords": [
                        "llm",
                        "evaluates_llm",
                        "evaluates_rag",
                        "requires_expected_answer",
                        "requires_actual_answer",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-summarization",
                        "es-purpose-summarization",
                        "evaluation-method-ngram",
                        "evaluation-type-deterministic"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "min_test_cases",
                            "description": "Minimum number of test cases, which produces useful results.",
                            "comment": "",
                            "type": "int",
                            "val": 0,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "rouge_1",
                            "display_name": "ROUGE-1",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "ROUGE-1 metric measures the overlap of 1-grams (individual words) between the generated and the reference summaries.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "rouge_2",
                            "display_name": "ROUGE-2",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "ROUGE-1 metric measures the overlap of 2-grams (pairs of consecutive words) between the generated and the reference summaries.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "rouge_l",
                            "display_name": "ROUGE-L",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "ROUGE-L metric considers the longest common subsequence (LCS) between the generated and reference summaries.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                },
                "created": 1769097947.3424354,
                "duration": 0.34417247772216797,
                "child_explainer_job_keys": null,
                "job_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4/h2o-sonar/mli_experiment_25269fb8-8cb5-4879-be2f-3570da92adc1/explainer_h2o_sonar_evaluators_rouge_evaluator_RougeEvaluator_05aca4a2-27a8-407d-a6f7-36153d8ab658",
                "result_descriptor": {
                    "id": "h2o_sonar.evaluators.rouge_evaluator.RougeEvaluator",
                    "name": "RougeEvaluator",
                    "display_name": "ROUGE",
                    "tagline": "Assess the fidelity of generated texts to the reference texts.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n|   | \u2713 |   | \u2713 |   |\n\n**Description**:\n\nROUGE (Recall-Oriented Understudy for Gisting Evaluation)\nis a set of evaluation metrics used to assess the quality of generated summaries\ncompared to reference summaries. There are several variations of ROUGE metrics,\nincluding `ROUGE-1`, `ROUGE-2`, and `ROUGE-L`.\n\n- Compatibility: RAG and LLM models.\n\n**Method**:\n\n- The evaluator reports `F1 score` between the generated (actual answer) and\n  reference (generated answer) n-grams.\n- `ROUGE-1` measures the overlap of 1-grams (individual words) between the generated\n  and the reference summaries.\n- `ROUGE-2` extends the evaluation to 2-grams (pairs of consecutive words).\n- `ROUGE-L` considers the longest common subsequence (LCS) between the generated and\n  reference summaries.\n- These ROUGE metrics provide a quantitative evaluation of the similarity between\n  the generated and reference texts to assess the effectiveness of\n  text summarization algorithms.\n\nSee also:\n\n- 3rd party library ROUGE: https://pypi.org/project/rouge-score/\n- 3rd party ROUGE source code:\n  https://github.com/google-research/google-research/tree/master/rouge\n\n**Metrics** calculated by the evaluator:\n\n- **ROUGE-1** (float)\n    - ROUGE-1 metric measures the overlap of 1-grams (individual words) between the generated and the reference summaries.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n- **ROUGE-2** (float)\n    - ROUGE-1 metric measures the overlap of 2-grams (pairs of consecutive words) between the generated and the reference summaries.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n- **ROUGE-L** (float)\n    - ROUGE-L metric considers the longest common subsequence (LCS) between the 
generated and reference summaries.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `min_test_cases` (int):\n    - Minimum number of test cases, which produces useful results.\n    - Default value: `\"\"`\n",
                    "brief_description": "ROUGE (Recall-Oriented Understudy for Gisting Evaluation)\nis a set of evaluation metrics used to assess the quality of generated summaries\ncompared to reference summaries. There are several variations of ROUGE metrics,\nincluding `ROUGE-1`, `ROUGE-2`, and `ROUGE-L`.\n\n- Compatibility: RAG and LLM models.",
                    "model_types": [
                        "llm",
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "Rouge evaluation results",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/csv",
                                "application/vnd.h2oai.datatable.jay"
                            ]
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "ROUGE leaderboard",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/markdown",
                                "application/vnd.h2oai-evalstudio-leaderboard.markdown"
                            ]
                        },
                        {
                            "explanation_type": "global-html-fragment",
                            "name": "ROUGE leaderboard as HTML",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "text/html"
                            ]
                        }
                    ],
                    "keywords": [
                        "llm",
                        "evaluates_llm",
                        "evaluates_rag",
                        "requires_expected_answer",
                        "requires_actual_answer",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-summarization",
                        "es-purpose-summarization",
                        "evaluation-method-ngram",
                        "evaluation-type-deterministic"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "min_test_cases",
                            "description": "Minimum number of test cases, which produces useful results.",
                            "comment": "",
                            "type": "int",
                            "val": 0,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "rouge_1",
                            "display_name": "ROUGE-1",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "ROUGE-1 metric measures the overlap of 1-grams (individual words) between the generated and the reference summaries.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "rouge_2",
                            "display_name": "ROUGE-2",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "ROUGE-1 metric measures the overlap of 2-grams (pairs of consecutive words) between the generated and the reference summaries.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "rouge_l",
                            "display_name": "ROUGE-L",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "ROUGE-L metric considers the longest common subsequence (LCS) between the generated and reference summaries.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                }
            },
            {
                "key": "3f7a63c5-5b81-470f-8e54-6516498b9438",
                "progress": 1.0,
                "status": 0,
                "error": "",
                "message": "DONE",
                "explainer": {
                    "id": "h2o_sonar.evaluators.rag_groundedness_evaluator.RagGroundednessEvaluator",
                    "name": "RagGroundednessEvaluator",
                    "display_name": "Groundedness (semantic similarity)",
                    "tagline": "Evaluate actual answers by assessing their relevance to the retrieved contexts.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n|   |   | \u2713 | \u2713 |   |\n\n**Description**:\n\nGroundedness Evaluator assesses the groundedness of\nthe base **LLM model** in a Retrieval Augmented Generation (RAG) pipeline. It\nevaluates whether the actual answer is factually correct information by **comparing**\nthe actual answer sentences to the retrieved context sentences - as the actual answer\ngenerated by the LLM model **must be based on** the retrieved context.\n\n**Method**:\n\n- The groundedness metric is calculated as:\n\n```math\ngroundedness = min( { max( {S(emb(a), emb(c)): for all c in C} ): for all a in A } )\n```\n\n- Where:\n    - `A` is the actual answer.\n    - `emb(a)` is a vector embedding of the actual answer sentence.\n    - `C` is the context retrieved by the RAG model.\n    - `emb(c)` is a vector embedding of the context chunk sentence.\n    - `S(a, c)` is the 1 - cosine distance between the actual answer sentence `a`\n      and the retrieved context sentence `c`.\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where BGE\n  stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n\n\n**Metrics** calculated by the evaluator:\n\n- **Groundedness** (float)\n    - Groundedness metric determines whether the RAG outputs factually correct information by comparing the **actual answer** sentences to the retrieved **context** sentences.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- 
If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n- The least grounded actual answer sentence (in case that the output metric score is below the threshold).\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `sentence_level_metrics` (bool):\n    - Controls whether sentence level metrics are generated.\n    - Default value: `True`\n- `min_test_cases` (int):\n    - Minimum number of test cases, which produces useful results.\n    - Default value: `\"\"`\n",
                    "brief_description": "Groundedness Evaluator assesses the groundedness of\nthe base **LLM model** in a Retrieval Augmented Generation (RAG) pipeline. It\nevaluates whether the actual answer is factually correct information by **comparing**\nthe actual answer sentences to the retrieved context sentences - as the actual answer\ngenerated by the LLM model **must be based on** the retrieved context.",
                    "model_types": [
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "LlmEvalResultsExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LlmHeatmapLeaderboardExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "WorkDirArchiveExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_rag",
                        "requires_retrieved_context",
                        "requires_actual_answer",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic",
                        "capability-answer-highlight"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "sentence_level_metrics",
                            "description": "Controls whether sentence level metrics are generated.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "min_test_cases",
                            "description": "Minimum number of test cases, which produces useful results.",
                            "comment": "",
                            "type": "int",
                            "val": 0,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "groundedness",
                            "display_name": "Groundedness",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Groundedness metric determines whether the RAG outputs factually correct information by comparing the **actual answer** sentences to the retrieved **context** sentences.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                },
                "created": 1769097947.6931243,
                "duration": 28.041447639465332,
                "child_explainer_job_keys": null,
                "job_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4/h2o-sonar/mli_experiment_25269fb8-8cb5-4879-be2f-3570da92adc1/explainer_h2o_sonar_evaluators_rag_groundedness_evaluator_RagGroundednessEvaluator_3f7a63c5-5b81-470f-8e54-6516498b9438",
                "result_descriptor": {
                    "id": "h2o_sonar.evaluators.rag_groundedness_evaluator.RagGroundednessEvaluator",
                    "name": "RagGroundednessEvaluator",
                    "display_name": "Groundedness (semantic similarity)",
                    "tagline": "Evaluate actual answers by assessing their relevance to the retrieved contexts.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n|   |   | \u2713 | \u2713 |   |\n\n**Description**:\n\nGroundedness Evaluator assesses the groundedness of\nthe base **LLM model** in a Retrieval Augmented Generation (RAG) pipeline. It\nevaluates whether the actual answer is factually correct information by **comparing**\nthe actual answer sentences to the retrieved context sentences - as the actual answer\ngenerated by the LLM model **must be based on** the retrieved context.\n\n**Method**:\n\n- The groundedness metric is calculated as:\n\n```math\ngroundedness = min( { max( {S(emb(a), emb(c)): for all c in C} ): for all a in A } )\n```\n\n- Where:\n    - `A` is the actual answer.\n    - `emb(a)` is a vector embedding of the actual answer sentence.\n    - `C` is the context retrieved by the RAG model.\n    - `emb(c)` is a vector embedding of the context chunk sentence.\n    - `S(a, c)` is the 1 - cosine distance between the actual answer sentence `a`\n      and the retrieved context sentence `c`.\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where BGE\n  stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n\n\n**Metrics** calculated by the evaluator:\n\n- **Groundedness** (float)\n    - Groundedness metric determines whether the RAG outputs factually correct information by comparing the **actual answer** sentences to the retrieved **context** sentences.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- 
If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n- The least grounded actual answer sentence (in case that the output metric score is below the threshold).\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `sentence_level_metrics` (bool):\n    - Controls whether sentence level metrics are generated.\n    - Default value: `True`\n- `min_test_cases` (int):\n    - Minimum number of test cases, which produces useful results.\n    - Default value: `0`\n",
                    "brief_description": "Groundedness Evaluator assesses the groundedness of\nthe base **LLM model** in a Retrieval Augmented Generation (RAG) pipeline. It\nevaluates whether the actual answer is factually correct information by **comparing**\nthe actual answer sentences to the retrieved context sentences - as the actual answer\ngenerated by the LLM model **must be based on** the retrieved context.",
                    "model_types": [
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "Evaluation metrics data",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/csv",
                                "application/vnd.h2oai.datatable.jay"
                            ]
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LLM heatmap leaderboard",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/markdown",
                                "application/vnd.h2oai-evalstudio-leaderboard.markdown"
                            ]
                        },
                        {
                            "explanation_type": "global-html-fragment",
                            "name": "LLM heatmap leaderboard as HTML",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "text/html"
                            ]
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "Archive of Groundedness (semantic similarity) artifacts",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/zip"
                            ]
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_rag",
                        "requires_retrieved_context",
                        "requires_actual_answer",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic",
                        "capability-answer-highlight"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "sentence_level_metrics",
                            "description": "Controls whether sentence level metrics are generated.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "min_test_cases",
                            "description": "Minimum number of test cases, which produces useful results.",
                            "comment": "",
                            "type": "int",
                            "val": 0,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "groundedness",
                            "display_name": "Groundedness",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Groundedness metric determines whether the RAG outputs factually correct information by comparing the **actual answer** sentences to the retrieved **context** sentences.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                }
            },
            {
                "key": "db54412b-4273-469c-8a59-b4f695389d02",
                "progress": 1.0,
                "status": 0,
                "error": "",
                "message": "DONE",
                "explainer": {
                    "id": "h2o_sonar.evaluators.rag_tokens_presence_evaluator.RagStrStrEvaluator",
                    "name": "RagStrStrEvaluator",
                    "display_name": "Text matching",
                    "tagline": "Evaluate the presence of specific strings in the answers and retrieved contexts.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n| \u2713 |   |   |   | \u2713 |\n\n**Description**:\n\nText Matching Evaluator assesses whether both\nthe retrieved context (in the case of RAG hosted models) and the generated answer\n**contain/match** a specified set of required strings. The evaluation is based on an\nboolean expression (condition) that can be used to define the required strings presence:\n\n- operands are **strings** or **regular expressions**\n- operators are `AND`, `OR`, and `NOT`\n- **parentheses** can be used to group expressions\n\n- **Example 1: Simple string matching**\n   - Expression: `\"15,969\"`\n   - The evaluator will check if the retrieved context and the actual answer\n     contain the string `15,969`. If the condition is satisfied, the test case\n     passes.\n\n- **Example 2: Flexible regex patterns**\n   - Expression: `regexp(\"15,?969\")`\n   - What if the number `15,969` might be expressed as `15969` or `15,969`?\n     The boolean expression can be extended to use a regular expression. The\n     evaluator will check if the retrieved context and the actual answer contain\n     the string `15,969` or `15969`. If the condition is satisfied, the test\n     case passes.\n\n- **Example 3: Combining string and regex**\n   - Expression: `\"15,969\" AND regexp(\"[Mm]illion\")`\n   - The evaluator will check if the retrieved context and the actual answer\n     contain the string `15,969` **and** match the regular expression\n     `[Mm]illion`. 
If the condition is satisfied, the test case passes.\n\n- **Example 4: Complex boolean logic**\n   - Expression: `(\"Rio\" OR \"rio\") AND regexp(\"15,?969 [Mm]il\") AND NOT \"Real\"`\n   - The evaluator will check if the retrieved context and the actual answer\n     contain either `Rio` or `rio` **and** match the regular expression\n     `15,969 [Mm]il` **and** do not contain the string `Real`. If the\n     condition is satisfied, the test case passes.\n\n- **Example 5: Exact matching with regex anchors**\n   - Expression: `regexp(\"^Brazil revenue was 15,969 million$\")`\n   - The evaluator will check if the retrieved context and the actual answer\n     **exactly** match the regular expression\n     `^Brazil revenue was 15,969 million$`. If the condition is satisfied, the\n     test case passes.\n\n- **Example 6: Case-insensitive matching**\n   - Expression: `regexp(\"(?i)python\")`\n   - The `(?i)` flag enables case-insensitive matching. The evaluator will match\n     `python`, `Python`, `PYTHON`, `PyThOn`, etc. This is useful when the\n     capitalization in the output is unpredictable.\n\n- **Example 7: OR within regular expressions**\n   - Expression: `regexp(\"(cat|dog|bird)\")`\n   - Using the pipe `|` operator inside a group allows matching multiple\n     alternatives. The evaluator will match any of: `cat`, `dog`, or `bird`.\n     This is more concise than using multiple `OR` operators in the boolean\n     expression.\n\n- **Example 8: Capturing groups and word boundaries**\n   - Expression: `regexp(\"\\b(error|warning|failure)\\b\")`\n   - The `\\b` word boundary ensures exact word matching (not as part of a larger\n     word). The regex will match `error`, `warning`, or `failure` as complete\n     words. 
Parentheses capture the matched text for reference.\n\n- **Example 9: Repeated patterns and quantifiers**\n   - Expression: `regexp(\"\\d{3}-\\d{3}-\\d{4}\")`\n   - Quantifiers specify repetition: `\\d{3}` matches exactly 3 digits, `+`\n     matches one or more, `*` matches zero or more. This example matches phone\n     numbers in the format `123-456-7890`. Use `\\d` for digits, `\\w` for\n     word characters, `\\s` for whitespace.\n\n- **Example 10: Lookahead and combining patterns**\n   - Expression: `regexp(\"(?i)(success|completed).*\\d+%\")`\n   - This combines case-insensitive matching `(?i)`, an OR group\n     `(success|completed)`, `.*` to match any characters, and `\\d+%` to\n     match one or more digits followed by a percent sign. Useful for matching\n     complex patterns like progress messages.\n\n**Method**:\n\n- The evaluator parses the boolean expression and checks if the retrieved context\n  and the generated answer contain the required strings.\n- The evaluator uses Python `re` module for regular expression matching (`re.search`\n  function). 
See https://docs.python.org/3/howto/regex.html#regex-howto\n\n\n\n**Metrics** calculated by the evaluator:\n\n- **Model passes** (float)\n    - Percentage of successfully evaluated RAG/LLM outputs.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n    - This is **primary** metric.\n- **Model failures** (float)\n    - Percentage of RAG/LLM outputs that failed to pass the evaluator check.\n    - Lower score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n- **Model retrieval failures** (float)\n    - Percentage of RAG's retrieved contexts that failed to pass the evaluator check.\n    - Lower score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n- **Model generation failures** (float)\n    - Percentage of outputs generated by RAG from the retrieved contexts that failed to pass the evaluator check (equivalent to the model failures).\n    - Lower score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n- **Model parse failures** (float)\n    - Percentage of RAG/LLM outputs that evaluator's judge (LLM, RAG, agent or model) was unable to parse, and therefore unable to evaluate and provide a metrics score.\n    - Lower score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Most accurate, least accurate, fastest, slowest, most expensive and cheapest LLM models based on the evaluated primary metric.\n- LLM models with best and worst context retrieval performance.\n- The most difficult test case for the evaluated LLM models, i.e., 
the prompt, which most of the evaluated LLM models had a problem answering correctly.\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.5`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `evaluate_retrieved_context` (bool):\n    - Control whether to evaluate also retrieved context - conditions to check whether it contains or does not contain specific strings.\n    - Default value: `False`\n",
                    "brief_description": "Text Matching Evaluator assesses whether both\nthe retrieved context (in the case of RAG hosted models) and the generated answer\n**contain/match** a specified set of required strings. The evaluation is based on an\nboolean expression (condition) that can be used to define the required strings presence:\n\n- operands are **strings** or **regular expressions**\n- operators are `AND`, `OR`, and `NOT`\n- **parentheses** can be used to group expressions",
                    "model_types": [
                        "llm",
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "LlmEvalResultsExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-llm-bool-leaderboard",
                            "name": "LlmBoolLeaderboardExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "WorkDirArchiveExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        }
                    ],
                    "keywords": [
                        "llm",
                        "evaluates_rag",
                        "evaluates_llm",
                        "requires_prompts",
                        "requires_constraints",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-ongoing-monitoring",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "nist-ai-rmf-secure-and-resilient",
                        "nist-ai-rmf-privacy-enhanced",
                        "nist-ai-rmf-fair",
                        "nist-ai-rmf-accountable-and-transparent",
                        "nist-ai-rmf-valid-and-reliable",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "problem-type-summarization",
                        "problem-type-regression",
                        "problem-type-classification",
                        "problem-type-binary-classification",
                        "problem-type-multiclass-classification",
                        "es-purpose-generation",
                        "evaluation-method-rule-based",
                        "evaluation-type-deterministic",
                        "capability-condition-highlight"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.5,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "evaluate_retrieved_context",
                            "description": "Control whether to evaluate also retrieved context - conditions to check whether it contains or does not contained specific strings.",
                            "comment": "",
                            "type": "bool",
                            "val": false,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "model_passes",
                            "display_name": "Model passes",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of successfully evaluated RAG/LLM outputs.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.5,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "model_failures",
                            "display_name": "Model failures",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of RAG/LLM outputs that failed to pass the evaluator check.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": false,
                            "threshold": 0.5,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "model_retrieval_failures",
                            "display_name": "Model retrieval failures",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of RAG's retrieved contexts that failed to pass the evaluator check.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": false,
                            "threshold": 0.5,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "model_generation_failures",
                            "display_name": "Model generation failures",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of outputs generated by RAG from the retrieved contexts that failed to pass the evaluator check (equivalent to the model failures).",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": false,
                            "threshold": 0.5,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "model_parse_failures",
                            "display_name": "Model parse failures",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of RAG/LLM outputs that evaluator's judge (LLM, RAG, agent or model) was unable to parse, and therefore unable to evaluate and provide a metrics score.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": false,
                            "threshold": 0.5,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                },
                "created": 1769097975.7410321,
                "duration": 0.19278836250305176,
                "child_explainer_job_keys": null,
                "job_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4/h2o-sonar/mli_experiment_25269fb8-8cb5-4879-be2f-3570da92adc1/explainer_h2o_sonar_evaluators_rag_tokens_presence_evaluator_RagStrStrEvaluator_db54412b-4273-469c-8a59-b4f695389d02",
                "result_descriptor": {
                    "id": "h2o_sonar.evaluators.rag_tokens_presence_evaluator.RagStrStrEvaluator",
                    "name": "RagStrStrEvaluator",
                    "display_name": "Text matching",
                    "tagline": "Evaluate the presence of specific strings in the answers and retrieved contexts.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n| \u2713 |   |   |   | \u2713 |\n\n**Description**:\n\nText Matching Evaluator assesses whether both\nthe retrieved context (in the case of RAG hosted models) and the generated answer\n**contain/match** a specified set of required strings. The evaluation is based on an\nboolean expression (condition) that can be used to define the required strings presence:\n\n- operands are **strings** or **regular expressions**\n- operators are `AND`, `OR`, and `NOT`\n- **parentheses** can be used to group expressions\n\n- **Example 1: Simple string matching**\n   - Expression: `\"15,969\"`\n   - The evaluator will check if the retrieved context and the actual answer\n     contain the string `15,969`. If the condition is satisfied, the test case\n     passes.\n\n- **Example 2: Flexible regex patterns**\n   - Expression: `regexp(\"15,?969\")`\n   - What if the number `15,969` might be expressed as `15969` or `15,969`?\n     The boolean expression can be extended to use a regular expression. The\n     evaluator will check if the retrieved context and the actual answer contain\n     the string `15,969` or `15969`. If the condition is satisfied, the test\n     case passes.\n\n- **Example 3: Combining string and regex**\n   - Expression: `\"15,969\" AND regexp(\"[Mm]illion\")`\n   - The evaluator will check if the retrieved context and the actual answer\n     contain the string `15,969` **and** match the regular expression\n     `[Mm]illion`. 
If the condition is satisfied, the test case passes.\n\n- **Example 4: Complex boolean logic**\n   - Expression: `(\"Rio\" OR \"rio\") AND regexp(\"15,?969 [Mm]il\") AND NOT \"Real\"`\n   - The evaluator will check if the retrieved context and the actual answer\n     contain either `Rio` or `rio` **and** match the regular expression\n     `15,969 [Mm]il` **and** do not contain the string `Real`. If the\n     condition is satisfied, the test case passes.\n\n- **Example 5: Exact matching with regex anchors**\n   - Expression: `regexp(\"^Brazil revenue was 15,969 million$\")`\n   - The evaluator will check if the retrieved context and the actual answer\n     **exactly** match the regular expression\n     `^Brazil revenue was 15,969 million$`. If the condition is satisfied, the\n     test case passes.\n\n- **Example 6: Case-insensitive matching**\n   - Expression: `regexp(\"(?i)python\")`\n   - The `(?i)` flag enables case-insensitive matching. The evaluator will match\n     `python`, `Python`, `PYTHON`, `PyThOn`, etc. This is useful when the\n     capitalization in the output is unpredictable.\n\n- **Example 7: OR within regular expressions**\n   - Expression: `regexp(\"(cat|dog|bird)\")`\n   - Using the pipe `|` operator inside a group allows matching multiple\n     alternatives. The evaluator will match any of: `cat`, `dog`, or `bird`.\n     This is more concise than using multiple `OR` operators in the boolean\n     expression.\n\n- **Example 8: Capturing groups and word boundaries**\n   - Expression: `regexp(\"\\b(error|warning|failure)\\b\")`\n   - The `\\b` word boundary ensures exact word matching (not as part of a larger\n     word). The regex will match `error`, `warning`, or `failure` as complete\n     words. 
Parentheses capture the matched text for reference.\n\n- **Example 9: Repeated patterns and quantifiers**\n   - Expression: `regexp(\"\\d{3}-\\d{3}-\\d{4}\")`\n   - Quantifiers specify repetition: `\\d{3}` matches exactly 3 digits, `+`\n     matches one or more, `*` matches zero or more. This example matches phone\n     numbers in the format `123-456-7890`. Use `\\d` for digits, `\\w` for\n     word characters, `\\s` for whitespace.\n\n- **Example 10: Lookahead and combining patterns**\n   - Expression: `regexp(\"(?i)(success|completed).*\\d+%\")`\n   - This combines case-insensitive matching `(?i)`, an OR group\n     `(success|completed)`, `.*` to match any characters, and `\\d+%` to\n     match one or more digits followed by a percent sign. Useful for matching\n     complex patterns like progress messages.\n\n**Method**:\n\n- The evaluator parses the boolean expression and checks if the retrieved context\n  and the generated answer contain the required strings.\n- The evaluator uses Python `re` module for regular expression matching (`re.search`\n  function). 
See https://docs.python.org/3/howto/regex.html#regex-howto\n\n\n\n**Metrics** calculated by the evaluator:\n\n- **Model passes** (float)\n    - Percentage of successfully evaluated RAG/LLM outputs.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n    - This is **primary** metric.\n- **Model failures** (float)\n    - Percentage of RAG/LLM outputs that failed to pass the evaluator check.\n    - Lower score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n- **Model retrieval failures** (float)\n    - Percentage of RAG's retrieved contexts that failed to pass the evaluator check.\n    - Lower score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n- **Model generation failures** (float)\n    - Percentage of outputs generated by RAG from the retrieved contexts that failed to pass the evaluator check (equivalent to the model failures).\n    - Lower score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n- **Model parse failures** (float)\n    - Percentage of RAG/LLM outputs that evaluator's judge (LLM, RAG, agent or model) was unable to parse, and therefore unable to evaluate and provide a metrics score.\n    - Lower score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.5`\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Most accurate, least accurate, fastest, slowest, most expensive and cheapest LLM models based on the evaluated primary metric.\n- LLM models with best and worst context retrieval performance.\n- The most difficult test case for the evaluated LLM models, i.e., 
the prompt, which most of the evaluated LLM models had a problem answering correctly.\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.5`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n- `evaluate_retrieved_context` (bool):\n    - Control whether to evaluate also retrieved context - conditions to check whether it contains or does not contain specific strings.\n    - Default value: `False`\n",
                    "brief_description": "Text Matching Evaluator assesses whether both\nthe retrieved context (in the case of RAG hosted models) and the generated answer\n**contain/match** a specified set of required strings. The evaluation is based on a\nboolean expression (condition) that can be used to define the required strings presence:\n\n- operands are **strings** or **regular expressions**\n- operators are `AND`, `OR`, and `NOT`\n- **parentheses** can be used to group expressions",
                    "model_types": [
                        "llm",
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "Text Matching evaluation results",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/csv",
                                "application/vnd.h2oai.datatable.jay"
                            ]
                        },
                        {
                            "explanation_type": "global-llm-bool-leaderboard",
                            "name": "RAG benchmark leaderboard",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "text/markdown",
                                "application/vnd.h2oai-evalstudio-leaderboard.markdown",
                                "application/json"
                            ]
                        },
                        {
                            "explanation_type": "global-html-fragment",
                            "name": "RAG benchmark leaderboard as HTML",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "text/html"
                            ]
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "Archive of Text matching artifacts",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/zip"
                            ]
                        }
                    ],
                    "keywords": [
                        "llm",
                        "evaluates_rag",
                        "evaluates_llm",
                        "requires_prompts",
                        "requires_constraints",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-ongoing-monitoring",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "nist-ai-rmf-secure-and-resilient",
                        "nist-ai-rmf-privacy-enhanced",
                        "nist-ai-rmf-fair",
                        "nist-ai-rmf-accountable-and-transparent",
                        "nist-ai-rmf-valid-and-reliable",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "problem-type-summarization",
                        "problem-type-regression",
                        "problem-type-classification",
                        "problem-type-binary-classification",
                        "problem-type-multiclass-classification",
                        "es-purpose-generation",
                        "evaluation-method-rule-based",
                        "evaluation-type-deterministic",
                        "capability-condition-highlight"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.5,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "evaluate_retrieved_context",
                            "description": "Control whether to evaluate also retrieved context - conditions to check whether it contains or does not contain specific strings.",
                            "comment": "",
                            "type": "bool",
                            "val": false,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "model_passes",
                            "display_name": "Model passes",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of successfully evaluated RAG/LLM outputs.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.5,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "model_failures",
                            "display_name": "Model failures",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of RAG/LLM outputs that failed to pass the evaluator check.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": false,
                            "threshold": 0.5,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "model_retrieval_failures",
                            "display_name": "Model retrieval failures",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of RAG's retrieved contexts that failed to pass the evaluator check.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": false,
                            "threshold": 0.5,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "model_generation_failures",
                            "display_name": "Model generation failures",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of outputs generated by RAG from the retrieved contexts that failed to pass the evaluator check (equivalent to the model failures).",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": false,
                            "threshold": 0.5,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "model_parse_failures",
                            "display_name": "Model parse failures",
                            "data_type": "float",
                            "display_value": ".0%",
                            "description": "Percentage of RAG/LLM outputs that evaluator's judge (LLM, RAG, agent or model) was unable to parse, and therefore unable to evaluate and provide a metrics score.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": false,
                            "threshold": 0.5,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                }
            },
            {
                "key": "a67eaa72-9238-44a6-8473-3c9770d0b8f8",
                "progress": 1.0,
                "status": 0,
                "error": "",
                "message": "DONE",
                "explainer": {
                    "id": "h2o_sonar.evaluators.rag_chunk_relevancy_evaluator.ContextChunkRelevancyEvaluator",
                    "name": "ContextChunkRelevancyEvaluator",
                    "display_name": "Context relevancy (soft recall and precision)",
                    "tagline": "Assess precision and relevancy of the retrieved context.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n| \u2713 |   | \u2713 |   |   |\n\n**Description**:\n\nContext Relevancy Evaluator assesses the context relevancy\nin a Retrieval Augmented Generation (RAG) pipeline.\nContext Relevancy (Soft Recall and Precision) Evaluator measures the relevancy\nof the retrieved context based on the question and context sentences and produces\ntwo metrics - **precision** and **recall relevancy**.\n\n**Method**:\n\n- The evaluator brings two metrics calculated as:\n\n```math\nchunk context relevancy(ch) = max( {S(emb(q), emb(s)): for all s in ch} )\n\nrecall relevancy = max( {chunk context relevancy(ch): for all ch in rc} )\nprecision relevancy = avg( {chunk context relevancy(ch): for all ch in rc} )\n```\n\n- Where:\n    - `rc` is the retrieved context.\n    - `ch` is a chunk of the retrieved context.\n    - `emb(s)` is a vector embedding of the retrieved context chunk sentence.\n    - `emb(q)` is a vector embedding of the query.\n    - `S(question, s)` is the 1 - cosine distance between the `question` and\n      the retrieved context sentence `s`.\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where BGE\n  stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n\n\n**Metrics** calculated by the evaluator:\n\n- **Recall Relevancy** (float)\n    - Maximum retrieved context chunk relevancy.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n- **Precision Relevancy** (float)\n    - Average retrieved context chunk relevancy.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the 
evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n",
                    "brief_description": "Context Relevancy Evaluator assesses the context relevancy\nin a Retrieval Augmented Generation (RAG) pipeline.\nContext Relevancy (Soft Recall and Precision) Evaluator measures the relevancy\nof the retrieved context based on the question and context sentences and produces\ntwo metrics - **precision** and **recall relevancy**.",
                    "model_types": [
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "LlmEvalResultsExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LlmHeatmapLeaderboardExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "WorkDirArchiveExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_rag",
                        "requires_retrieved_context",
                        "requires_prompts",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "recall_relevancy",
                            "display_name": "Recall Relevancy",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Maximum retrieved context chunk relevancy.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "precision_relevancy",
                            "display_name": "Precision Relevancy",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Average retrieved context chunk relevancy.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                },
                "created": 1769097975.9408224,
                "duration": 25.914331912994385,
                "child_explainer_job_keys": null,
                "job_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4/h2o-sonar/mli_experiment_25269fb8-8cb5-4879-be2f-3570da92adc1/explainer_h2o_sonar_evaluators_rag_chunk_relevancy_evaluator_ContextChunkRelevancyEvaluator_a67eaa72-9238-44a6-8473-3c9770d0b8f8",
                "result_descriptor": {
                    "id": "h2o_sonar.evaluators.rag_chunk_relevancy_evaluator.ContextChunkRelevancyEvaluator",
                    "name": "ContextChunkRelevancyEvaluator",
                    "display_name": "Context relevancy (soft recall and precision)",
                    "tagline": "Assess precision and relevancy of the retrieved context.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n| \u2713 |   | \u2713 |   |   |\n\n**Description**:\n\nContext Relevancy Evaluator assesses the context relevancy\nin a Retrieval Augmented Generation (RAG) pipeline.\nContext Relevancy (Soft Recall and Precision) Evaluator measures the relevancy\nof the retrieved context based on the question and context sentences and produces\ntwo metrics - **precision** and **recall relevancy**.\n\n**Method**:\n\n- The evaluator brings two metrics calculated as:\n\n```math\nchunk context relevancy(ch) = max( {S(emb(q), emb(s)): for all s in ch} )\n\nrecall relevancy = max( {chunk context relevancy(ch): for all ch in rc} )\nprecision relevancy = avg( {chunk context relevancy(ch): for all ch in rc} )\n```\n\n- Where:\n    - `rc` is the retrieved context.\n    - `ch` is a chunk of the retrieved context.\n    - `emb(s)` is a vector embedding of the retrieved context chunk sentence.\n    - `emb(q)` is a vector embedding of the query.\n    - `S(question, s)` is the 1 - cosine distance between the `question` and\n      the retrieved context sentence `s`.\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where BGE\n  stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n\n\n**Metrics** calculated by the evaluator:\n\n- **Recall Relevancy** (float)\n    - Maximum retrieved context chunk relevancy.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n- **Precision Relevancy** (float)\n    - Average retrieved context chunk relevancy.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the 
evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n\nEvaluator **parameters**:\n\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n",
                    "brief_description": "Context Relevancy Evaluator assesses the context relevancy\nin a Retrieval Augmented Generation (RAG) pipeline.\nContext Relevancy (Soft Recall and Precision) Evaluator measures the relevancy\nof the retrieved context based on the question and context sentences and produces\ntwo metrics - **precision** and **recall relevancy**.",
                    "model_types": [
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "Evaluation metrics data",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/csv",
                                "application/vnd.h2oai.datatable.jay"
                            ]
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LLM heatmap leaderboard",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/markdown",
                                "application/vnd.h2oai-evalstudio-leaderboard.markdown"
                            ]
                        },
                        {
                            "explanation_type": "global-html-fragment",
                            "name": "LLM heatmap leaderboard as HTML",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "text/html"
                            ]
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "Archive of Context relevancy (soft recall and precision) artifacts",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/zip"
                            ]
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_rag",
                        "requires_retrieved_context",
                        "requires_prompts",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic"
                    ],
                    "parameters": [
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "recall_relevancy",
                            "display_name": "Recall Relevancy",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Maximum retrieved context chunk relevancy.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": false,
                            "parent_metric": "",
                            "exclude": false
                        },
                        {
                            "key": "precision_relevancy",
                            "display_name": "Precision Relevancy",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Average retrieved context chunk relevancy.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                }
            },
            {
                "key": "2984dc08-357c-41ee-b261-32e8871f3be0",
                "progress": 1.0,
                "status": 0,
                "error": "",
                "message": "DONE",
                "explainer": {
                    "id": "h2o_sonar.evaluators.rag_context_mean_reciprocal_rank_evaluator.MeanReciprocalRankEvaluator",
                    "name": "MeanReciprocalRankEvaluator",
                    "display_name": "Context mean reciprocal rank",
                    "tagline": "Assess mean reciprocal rank of the retrieved context.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n| \u2713 |   | \u2713 |   |   |\n\n**Description**:\n\nMean Reciprocal Rank Evaluator assesses the performance of\n    the retrieval component of a RAG system by measuring the average of the reciprocal\n    ranks of the first relevant document retrieved for a set of queries. It helps to\n    evaluate how effectively the retrieval component of a RAG system provides relevant\n    context for generating accurate and contextually appropriate responses.\n\n**Method**:\n\n- The evaluator brings mean reciprocal rank (MRR) metric.\n- Relevant retrieved context chunk is defined as the chunk that contains the answer\n  to the query. The relevance score is calculated as:\n\n```math\nrelevance score = max( S(ctx chunk sentence, query) )\n```\n\n- Where S(a, b) is the similarity score between texts a and b, calculated as\n  1 - cosine distance between their vector embeddings.\n- For a single query, the reciprocal rank is the inverse of the rank of the first\n  relevant document retrieved:\n\n```math\nreciprocal rank = 1 / rank of the first chunk with relevance score >= threshold\n```\n\n- If the first relevant document is at rank 1, the reciprocal rank is 1.0 (best\n  score). If no relevant document is retrieved, the reciprocal rank is 0.0 (worst\n  score). If the first relevant document is at rank 5, the reciprocal rank\n  is 1 / 5 i.e. 
0.2.\n- Threshold for the relevance score is set to 0.7 by default, but can be\n  adjusted using the evaluator parameter.\n- Mean reciprocal rank (MRR) is the average of the reciprocal ranks across all\n  queries:\n\n```math\nmean reciprocal rank = sum(reciprocal rank for query in queries) / |queries|\n```\n\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where BGE\n  stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n\n\n**Metrics** calculated by the evaluator:\n\n- **Mean Reciprocal Rank** (float)\n    - Mean reciprocal rank metric score given the first relevant retrieved context chunk.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n\nEvaluator **parameters**:\n\n- `mrr_relevant_chunk_threshold` (float):\n    - Threshold for the relevance score of the retrieved context chunk. The relevance score is calculated as: S(ctx chunk, query). The threshold value should be between 0.0 and 1.0 (default: 0.7).\n    - Default value: `0.7`\n- `mrr_relevant_chunk_oor_idx` (int):\n    - Threshold for the index of the relevant chunk in the retrieved context. 
If the first relevant chunk is at an index higher than this value, it is considered out of range and the reciprocal rank for that query is set to 0.0. The value should be a positive integer (default: 10).\n    - Default value: `10`\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n",
                    "brief_description": "Mean Reciprocal Rank Evaluator assesses the performance of\n    the retrieval component of a RAG system by measuring the average of the reciprocal\n    ranks of the first relevant document retrieved for a set of queries. It helps to\n    evaluate how effectively the retrieval component of a RAG system provides relevant\n    context for generating accurate and contextually appropriate responses.",
                    "model_types": [
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "LlmEvalResultsExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LlmHeatmapLeaderboardExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "WorkDirArchiveExplanation",
                            "category": "",
                            "scope": "global",
                            "has_local": "",
                            "formats": []
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_rag",
                        "requires_retrieved_context",
                        "requires_prompts",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic"
                    ],
                    "parameters": [
                        {
                            "name": "mrr_relevant_chunk_threshold",
                            "description": "Threshold for the relevance score of the retrieved context chunk. The relevance score is calculated as: S(ctx chunk, query). The threshold value should be between 0.0 and 1.0 (default: 0.7).",
                            "comment": "",
                            "type": "float",
                            "val": 0.7,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "mrr_relevant_chunk_oor_idx",
                            "description": "Threshold for the index of the relevant chunk in the retrieved context. If the first relevant chunk is at an index higher than this value, it is considered out of range and the reciprocal rank for that query is set to 0.0. The value should be a positive integer (default: 10).",
                            "comment": "",
                            "type": "int",
                            "val": 10,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "mean_reciprocal_rank",
                            "display_name": "Mean Reciprocal Rank",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Mean reciprocal rank metric score given the first relevant retrieved context chunk.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                },
                "created": 1769098001.862311,
                "duration": 25.885531187057495,
                "child_explainer_job_keys": null,
                "job_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4/h2o-sonar/mli_experiment_25269fb8-8cb5-4879-be2f-3570da92adc1/explainer_h2o_sonar_evaluators_rag_context_mean_reciprocal_rank_evaluator_MeanReciprocalRankEvaluator_2984dc08-357c-41ee-b261-32e8871f3be0",
                "result_descriptor": {
                    "id": "h2o_sonar.evaluators.rag_context_mean_reciprocal_rank_evaluator.MeanReciprocalRankEvaluator",
                    "name": "MeanReciprocalRankEvaluator",
                    "display_name": "Context mean reciprocal rank",
                    "tagline": "Assess mean reciprocal rank of the retrieved context.",
                    "description": "**Evaluator input requirements**:\n\n| Question | Expected Answer | Retrieved Context | Actual Answer | Conditions  |\n| --- | --- | --- | --- | --- |\n| \u2713 |   | \u2713 |   |   |\n\n**Description**:\n\nMean Reciprocal Rank Evaluator assesses the performance of\n    the retrieval component of a RAG system by measuring the average of the reciprocal\n    ranks of the first relevant document retrieved for a set of queries. It helps to\n    evaluate how effectively the retrieval component of a RAG system provides relevant\n    context for generating accurate and contextually appropriate responses.\n\n**Method**:\n\n- The evaluator brings mean reciprocal rank (MRR) metric.\n- Relevant retrieved context chunk is defined as the chunk that contains the answer\n  to the query. The relevance score is calculated as:\n\n```math\nrelevance score = max( S(ctx chunk sentence, query) )\n```\n\n- Where S(a, b) is the similarity score between texts a and b, calculated as\n  1 - cosine distance between their vector embeddings.\n- For a single query, the reciprocal rank is the inverse of the rank of the first\n  relevant document retrieved:\n\n```math\nreciprocal rank = 1 / rank of the first chunk with relevance score >= threshold\n```\n\n- If the first relevant document is at rank 1, the reciprocal rank is 1.0 (best\n  score). If no relevant document is retrieved, the reciprocal rank is 0.0 (worst\n  score). If the first relevant document is at rank 5, the reciprocal rank\n  is 1 / 5 i.e. 
0.2.\n- Threshold for the relevance score is set to 0.7 by default, but can be\n  adjusted using the evaluator parameter.\n- Mean reciprocal rank (MRR) is the average of the reciprocal ranks across all\n  queries:\n\n```math\nmean reciprocal rank = sum(reciprocal rank for query in queries) / |queries|\n```\n\n- The evaluator uses **embeddings**\n  [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) (where BGE\n  stands for \"BAAI General Embedding\" which refers to a suite of open-source text\n  embedding models developed by the Beijing Academy of Artificial Intelligence (BAAI)).\n\n\n**Metrics** calculated by the evaluator:\n\n- **Mean Reciprocal Rank** (float)\n    - Mean reciprocal rank metric score given the first relevant retrieved context chunk.\n    - Higher score is better.\n    - Range: `[0.0, 1.0]`\n    - Default threshold: `0.75`\n    - This is **primary** metric.\n\n**Problems** reported by the evaluator:\n\n- If average score of the metric for an evaluated LLM is below the threshold, then the evaluator will report a problem for that LLM.\n- If test suite has perturbed test cases, then the evaluator will report a problem for each perturbed test case and LLM model whose metric flipped (moved above/below threshold) after perturbation.\n\n**Insights** diagnosed by the evaluator:\n\n- Best performing LLM model based on the evaluated primary metric.\n- The most difficult test case for the evaluated LLM models, i.e., the prompt, which most of the evaluated LLM models had a problem answering correctly.\n\nEvaluator **parameters**:\n\n- `mrr_relevant_chunk_threshold` (float):\n    - Threshold for the relevance score of the retrieved context chunk. The relevance score is calculated as: S(ctx chunk, query). The threshold value should be between 0.0 and 1.0 (default: 0.7).\n    - Default value: `0.7`\n- `mrr_relevant_chunk_oor_idx` (int):\n    - Threshold for the index of the relevant chunk in the retrieved context. 
If the first relevant chunk is at an index higher than this value, it is considered out of range and the reciprocal rank for that query is set to 0.0. The value should be a positive integer (default: 10).\n    - Default value: `10`\n- `metric_threshold` (float):\n    - Evaluated metric threshold - values below this threshold are considered problematic.\n    - Default value: `0.75`\n- `save_llm_result` (bool):\n    - Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.\n    - Default value: `True`\n",
                    "brief_description": "Mean Reciprocal Rank Evaluator assesses the performance of\n    the retrieval component of a RAG system by measuring the average of the reciprocal\n    ranks of the first relevant document retrieved for a set of queries. It helps to\n    evaluate how effectively the retrieval component of a RAG system provides relevant\n    context for generating accurate and contextually appropriate responses.",
                    "model_types": [
                        "rag"
                    ],
                    "can_explain": [],
                    "explanation_scopes": [
                        "global_scope",
                        "local_scope"
                    ],
                    "explanations": [
                        {
                            "explanation_type": "global-llm-eval-results",
                            "name": "Evaluation metrics data",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/csv",
                                "application/vnd.h2oai.datatable.jay"
                            ]
                        },
                        {
                            "explanation_type": "global-llm-heatmap-leaderboard",
                            "name": "LLM heatmap leaderboard",
                            "category": "LLM",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/json",
                                "text/markdown",
                                "application/vnd.h2oai-evalstudio-leaderboard.markdown"
                            ]
                        },
                        {
                            "explanation_type": "global-html-fragment",
                            "name": "LLM heatmap leaderboard as HTML",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "text/html"
                            ]
                        },
                        {
                            "explanation_type": "global-work-dir-archive",
                            "name": "Archive of Context mean reciprocal rank artifacts",
                            "category": "COMPLIANCE TESTS",
                            "scope": "global",
                            "has_local": null,
                            "formats": [
                                "application/zip"
                            ]
                        }
                    ],
                    "keywords": [
                        "hardware-gpu-optional",
                        "llm",
                        "evaluates_rag",
                        "requires_retrieved_context",
                        "requires_prompts",
                        "sr-11-7-conceptual-soundness",
                        "sr-11-7-outcomes-analysis",
                        "nist-ai-rmf-safe",
                        "problem-type-information-retrieval",
                        "problem-type-question-answering",
                        "evaluator-role-regulator",
                        "es-purpose-generation",
                        "evaluation-method-semantic-similarity",
                        "evaluation-type-deterministic"
                    ],
                    "parameters": [
                        {
                            "name": "mrr_relevant_chunk_threshold",
                            "description": "Threshold for the relevance score of the retrieved context chunk. The relevance score is calculated as: S(ctx chunk, query). The threshold value should be between 0.0 and 1.0 (default: 0.7).",
                            "comment": "",
                            "type": "float",
                            "val": 0.7,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "mrr_relevant_chunk_oor_idx",
                            "description": "Threshold for the index of the relevant chunk in the retrieved context. If the first relevant chunk is at an index higher than this value, it is considered out of range and the reciprocal rank for that query is set to 0.0. The value should be a positive integer (default: 10).",
                            "comment": "",
                            "type": "int",
                            "val": 10,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "metric_threshold",
                            "description": "Evaluated metric threshold - values below this threshold are considered problematic.",
                            "comment": "",
                            "type": "float",
                            "val": 0.75,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        },
                        {
                            "name": "save_llm_result",
                            "description": "Control whether to save LLM result which contains input LLM dataset and all metrics calculated by the evaluator.",
                            "comment": "",
                            "type": "bool",
                            "val": true,
                            "predefined": [],
                            "tags": [],
                            "min_": 0.0,
                            "max_": 0.0,
                            "category": ""
                        }
                    ],
                    "metrics_meta": [
                        {
                            "key": "mean_reciprocal_rank",
                            "display_name": "Mean Reciprocal Rank",
                            "data_type": "float",
                            "display_value": ".4f",
                            "description": "Mean reciprocal rank metric score given the first relevant retrieved context chunk.",
                            "value_range": [
                                0.0,
                                1.0
                            ],
                            "value_enum": null,
                            "higher_is_better": true,
                            "threshold": 0.75,
                            "is_primary_metric": true,
                            "parent_metric": "",
                            "exclude": false
                        }
                    ]
                }
            }
        ],
        "problems": [
            {
                "description": "Evaluated model claude-3-7-sonnet-20250219 failed to satisfy the threshold 0.75 for metric 'ROUGE-L', with average score 0.1759. ROUGE-L metric: ROUGE-L metric considers the longest common subsequence (LCS) between the generated and reference summaries.",
                "description_html": "Evaluated model\n<code>\n  claude-3-7-sonnet-20250219\n</code>\n failed to satisfy the \n<b class=\"w3-black\">\n  &nbsp;threshold&nbsp;\n</b>\n<code>\n  &nbsp;0.75\n</code>\n for metric \n<code>\n  ROUGE-L\n</code>\nwith average \n<b class=\"w3-black\">\n  &nbsp;score&nbsp;\n</b>\n<code>\n  &nbsp;0.1759.\n</code>\nMetric details:\n<i>\n  ROUGE-L metric considers the longest common subsequence (LCS) between the generated and reference summaries.\n</i>",
                "actions_description": "To improve summarizations, focus on three key areas:  refinement, evaluation, and training data. The LLM can be equipped with auto-refinement modules that assess its own summaries and identify areas for improvement, like missing key points. Additionally, using metrics that go beyond surface-level similarity to human-written summaries can guide the training process. Finally, incorporating diverse and high-quality summaries into the training data provides the LLM with better examples to learn from, leading to more comprehensive and informative summaries. ",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.rouge_evaluator.RougeEvaluator",
                "explainer_name": "ROUGE",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "severity": "HIGH",
                "problem_type": "summarization",
                "problem_attrs": {
                    "model_name": "claude-3-7-sonnet-20250219",
                    "metric_id": "rouge_l",
                    "metric_name": "ROUGE-L",
                    "metric_threshold": 0.75,
                    "metric_score": 0.17594324123024582,
                    "dataset_row_keys": [
                        [
                            "3d218d65-a2e6-4de4-af99-e93bd9132440",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "3d218d65-a2e6-4de4-af99-e93bd9132440",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "3d218d65-a2e6-4de4-af99-e93bd9132440",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "85804363-a32f-452b-b58e-9b43578e25bd",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "85804363-a32f-452b-b58e-9b43578e25bd",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "85804363-a32f-452b-b58e-9b43578e25bd",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "f55701bc-6071-4bc0-8c83-6abc8199c116",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "f55701bc-6071-4bc0-8c83-6abc8199c116",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "f55701bc-6071-4bc0-8c83-6abc8199c116",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "af27e87a-fbba-43c7-96db-32821cfba555",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "af27e87a-fbba-43c7-96db-32821cfba555",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "af27e87a-fbba-43c7-96db-32821cfba555",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "496908a0-7291-48de-8e1f-51d80f3b9332",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "496908a0-7291-48de-8e1f-51d80f3b9332",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "496908a0-7291-48de-8e1f-51d80f3b9332",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "df54fbd7-1de5-4858-86b8-ef13c11544f1",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "df54fbd7-1de5-4858-86b8-ef13c11544f1",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "df54fbd7-1de5-4858-86b8-ef13c11544f1",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "35118a2b-7931-496c-afb2-5725c7509813",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "35118a2b-7931-496c-afb2-5725c7509813",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "35118a2b-7931-496c-afb2-5725c7509813",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "92df97af-de0e-493b-9700-d5d81c572a71",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "92df97af-de0e-493b-9700-d5d81c572a71",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "92df97af-de0e-493b-9700-d5d81c572a71",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "efb7b80e-8528-4cae-8395-eb3e746576e7",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "efb7b80e-8528-4cae-8395-eb3e746576e7",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "efb7b80e-8528-4cae-8395-eb3e746576e7",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "cb5a2918-f796-48c9-9456-bdc3a9790d67",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "cb5a2918-f796-48c9-9456-bdc3a9790d67",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "cb5a2918-f796-48c9-9456-bdc3a9790d67",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "18eb6562-0d5c-4196-a205-2b80ecb68738",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "18eb6562-0d5c-4196-a205-2b80ecb68738",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "18eb6562-0d5c-4196-a205-2b80ecb68738",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "4f8e0f41-2b93-4e84-b369-00766cd935c4",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "4f8e0f41-2b93-4e84-b369-00766cd935c4",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "4f8e0f41-2b93-4e84-b369-00766cd935c4",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "28e12aaa-d6a3-4e2f-bd81-de48aa225ad0",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "28e12aaa-d6a3-4e2f-bd81-de48aa225ad0",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "28e12aaa-d6a3-4e2f-bd81-de48aa225ad0",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "4ee630a0-be1f-4fec-a69d-e8d32a03e47a",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "4ee630a0-be1f-4fec-a69d-e8d32a03e47a",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "4ee630a0-be1f-4fec-a69d-e8d32a03e47a",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e9907955-f1c9-46ae-8fa8-c82064b4ec6e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e9907955-f1c9-46ae-8fa8-c82064b4ec6e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e9907955-f1c9-46ae-8fa8-c82064b4ec6e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "77c521ef-9607-4d9a-a2b4-5999b011436f",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "77c521ef-9607-4d9a-a2b4-5999b011436f",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "77c521ef-9607-4d9a-a2b4-5999b011436f",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "bfc85b98-9b19-49f3-9054-86cc10d7ad8e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "bfc85b98-9b19-49f3-9054-86cc10d7ad8e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "bfc85b98-9b19-49f3-9054-86cc10d7ad8e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c6d2ee62-5c24-4fea-9737-1d18d2d49034",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c6d2ee62-5c24-4fea-9737-1d18d2d49034",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c6d2ee62-5c24-4fea-9737-1d18d2d49034",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "77eebd7e-a1a5-4923-8fab-919808a62de3",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "77eebd7e-a1a5-4923-8fab-919808a62de3",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "77eebd7e-a1a5-4923-8fab-919808a62de3",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "748cd522-eecd-4851-b9b8-419c0493ce3d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "748cd522-eecd-4851-b9b8-419c0493ce3d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "748cd522-eecd-4851-b9b8-419c0493ce3d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "42b5d4d6-c23e-453d-ba4e-9f6530d655d9",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "42b5d4d6-c23e-453d-ba4e-9f6530d655d9",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "42b5d4d6-c23e-453d-ba4e-9f6530d655d9",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "32633989-a8e5-4f69-a1bd-66cc9c6b01b3",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "32633989-a8e5-4f69-a1bd-66cc9c6b01b3",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "32633989-a8e5-4f69-a1bd-66cc9c6b01b3",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "7e333b8c-119d-417f-9ce9-a23fd4985956",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "7e333b8c-119d-417f-9ce9-a23fd4985956",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "7e333b8c-119d-417f-9ce9-a23fd4985956",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c5d12db6-80ca-4b06-9789-c3d87b0eedcb",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c5d12db6-80ca-4b06-9789-c3d87b0eedcb",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c5d12db6-80ca-4b06-9789-c3d87b0eedcb",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a9721f12-8b70-4c68-a091-e89d16921707",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a9721f12-8b70-4c68-a091-e89d16921707",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a9721f12-8b70-4c68-a091-e89d16921707",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "1a935eb2-02f5-4e58-9d0f-ab06600f9f30",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "1a935eb2-02f5-4e58-9d0f-ab06600f9f30",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "1a935eb2-02f5-4e58-9d0f-ab06600f9f30",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "15b46307-bef3-4519-8ca1-292459d6c32e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "15b46307-bef3-4519-8ca1-292459d6c32e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "15b46307-bef3-4519-8ca1-292459d6c32e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "eed571ae-3816-4913-9dd0-d7bab6a0d286",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "eed571ae-3816-4913-9dd0-d7bab6a0d286",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "eed571ae-3816-4913-9dd0-d7bab6a0d286",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "f1eef2d7-2529-4614-afcf-6ed87be766df",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "f1eef2d7-2529-4614-afcf-6ed87be766df",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "f1eef2d7-2529-4614-afcf-6ed87be766df",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "b9d3d130-bed8-487e-a856-742f31a21a59",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "b9d3d130-bed8-487e-a856-742f31a21a59",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "b9d3d130-bed8-487e-a856-742f31a21a59",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c1995f07-14eb-42f9-9fc8-9fd0bf6314be",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c1995f07-14eb-42f9-9fc8-9fd0bf6314be",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "c1995f07-14eb-42f9-9fc8-9fd0bf6314be",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "ba0c7628-49e4-4724-b00c-8ac504938eee",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "ba0c7628-49e4-4724-b00c-8ac504938eee",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "ba0c7628-49e4-4724-b00c-8ac504938eee",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "070f69a5-10ac-4455-9410-21c2734b54e5",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "070f69a5-10ac-4455-9410-21c2734b54e5",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "070f69a5-10ac-4455-9410-21c2734b54e5",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a8171d92-cba3-4da0-be2e-58e3125a3686",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a8171d92-cba3-4da0-be2e-58e3125a3686",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a8171d92-cba3-4da0-be2e-58e3125a3686",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "37789f65-f2f5-47a4-8cfe-65d4a4026d3e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "37789f65-f2f5-47a4-8cfe-65d4a4026d3e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "37789f65-f2f5-47a4-8cfe-65d4a4026d3e",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "37d5ca32-c522-432d-b3ee-c8cd7834dd8b",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "37d5ca32-c522-432d-b3ee-c8cd7834dd8b",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "37d5ca32-c522-432d-b3ee-c8cd7834dd8b",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "66f91103-68c2-4771-b445-00a5b4f07c7d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "66f91103-68c2-4771-b445-00a5b4f07c7d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "66f91103-68c2-4771-b445-00a5b4f07c7d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "39f5768c-be1b-4ad3-8f63-8e3173f47cae",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "39f5768c-be1b-4ad3-8f63-8e3173f47cae",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "39f5768c-be1b-4ad3-8f63-8e3173f47cae",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "21e38b0d-51c4-4a73-b892-a28efa3fc633",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "21e38b0d-51c4-4a73-b892-a28efa3fc633",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "21e38b0d-51c4-4a73-b892-a28efa3fc633",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e626299b-2ac3-4074-a9fa-99cf70bba15b",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e626299b-2ac3-4074-a9fa-99cf70bba15b",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e626299b-2ac3-4074-a9fa-99cf70bba15b",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a527175d-5b3f-44b6-b003-f73874e0ad94",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a527175d-5b3f-44b6-b003-f73874e0ad94",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "a527175d-5b3f-44b6-b003-f73874e0ad94",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "6c622d3f-bd6f-4bb4-80fa-1993b8cc9774",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "6c622d3f-bd6f-4bb4-80fa-1993b8cc9774",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "6c622d3f-bd6f-4bb4-80fa-1993b8cc9774",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "dd195d2f-a3af-47dd-a325-b4c215a5791d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "dd195d2f-a3af-47dd-a325-b4c215a5791d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "dd195d2f-a3af-47dd-a325-b4c215a5791d",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "1d5d8e96-3fbe-4e09-bece-79be8b7eb030",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "1d5d8e96-3fbe-4e09-bece-79be8b7eb030",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "1d5d8e96-3fbe-4e09-bece-79be8b7eb030",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "6b8063e9-88f6-459f-a817-0672a5d59a97",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "6b8063e9-88f6-459f-a817-0672a5d59a97",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "6b8063e9-88f6-459f-a817-0672a5d59a97",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "10ac0692-d550-4d52-ad6a-30bb81ed03ba",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "10ac0692-d550-4d52-ad6a-30bb81ed03ba",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "10ac0692-d550-4d52-ad6a-30bb81ed03ba",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "48e48953-07be-4412-888a-0e2f38bf6baa",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "48e48953-07be-4412-888a-0e2f38bf6baa",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "48e48953-07be-4412-888a-0e2f38bf6baa",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "0ba1a236-2ee4-43f1-80ff-8ffefc320524",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "0ba1a236-2ee4-43f1-80ff-8ffefc320524",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "0ba1a236-2ee4-43f1-80ff-8ffefc320524",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "56cebb9e-c35c-40e5-aa3b-32ea53e59cb9",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "56cebb9e-c35c-40e5-aa3b-32ea53e59cb9",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "56cebb9e-c35c-40e5-aa3b-32ea53e59cb9",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e174bcd1-6a7f-4c91-86d5-c00b4c452e03",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e174bcd1-6a7f-4c91-86d5-c00b4c452e03",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ],
                        [
                            "e174bcd1-6a7f-4c91-86d5-c00b4c452e03",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ]
                    ],
                    "test_case_keys": [
                        "3d218d65-a2e6-4de4-af99-e93bd9132440",
                        "e174bcd1-6a7f-4c91-86d5-c00b4c452e03",
                        "6c622d3f-bd6f-4bb4-80fa-1993b8cc9774",
                        "df54fbd7-1de5-4858-86b8-ef13c11544f1",
                        "dd195d2f-a3af-47dd-a325-b4c215a5791d",
                        "496908a0-7291-48de-8e1f-51d80f3b9332",
                        "7e333b8c-119d-417f-9ce9-a23fd4985956",
                        "37d5ca32-c522-432d-b3ee-c8cd7834dd8b",
                        "77c521ef-9607-4d9a-a2b4-5999b011436f",
                        "748cd522-eecd-4851-b9b8-419c0493ce3d",
                        "32633989-a8e5-4f69-a1bd-66cc9c6b01b3",
                        "cb5a2918-f796-48c9-9456-bdc3a9790d67",
                        "21e38b0d-51c4-4a73-b892-a28efa3fc633",
                        "bfc85b98-9b19-49f3-9054-86cc10d7ad8e",
                        "efb7b80e-8528-4cae-8395-eb3e746576e7",
                        "1d5d8e96-3fbe-4e09-bece-79be8b7eb030",
                        "eed571ae-3816-4913-9dd0-d7bab6a0d286",
                        "6b8063e9-88f6-459f-a817-0672a5d59a97",
                        "39f5768c-be1b-4ad3-8f63-8e3173f47cae",
                        "0ba1a236-2ee4-43f1-80ff-8ffefc320524",
                        "15b46307-bef3-4519-8ca1-292459d6c32e",
                        "ba0c7628-49e4-4724-b00c-8ac504938eee",
                        "10ac0692-d550-4d52-ad6a-30bb81ed03ba",
                        "af27e87a-fbba-43c7-96db-32821cfba555",
                        "92df97af-de0e-493b-9700-d5d81c572a71",
                        "77eebd7e-a1a5-4923-8fab-919808a62de3",
                        "f1eef2d7-2529-4614-afcf-6ed87be766df",
                        "28e12aaa-d6a3-4e2f-bd81-de48aa225ad0",
                        "37789f65-f2f5-47a4-8cfe-65d4a4026d3e",
                        "1a935eb2-02f5-4e58-9d0f-ab06600f9f30",
                        "c6d2ee62-5c24-4fea-9737-1d18d2d49034",
                        "56cebb9e-c35c-40e5-aa3b-32ea53e59cb9",
                        "48e48953-07be-4412-888a-0e2f38bf6baa",
                        "42b5d4d6-c23e-453d-ba4e-9f6530d655d9",
                        "e626299b-2ac3-4074-a9fa-99cf70bba15b",
                        "f55701bc-6071-4bc0-8c83-6abc8199c116",
                        "070f69a5-10ac-4455-9410-21c2734b54e5",
                        "66f91103-68c2-4771-b445-00a5b4f07c7d",
                        "35118a2b-7931-496c-afb2-5725c7509813",
                        "85804363-a32f-452b-b58e-9b43578e25bd",
                        "4ee630a0-be1f-4fec-a69d-e8d32a03e47a",
                        "c5d12db6-80ca-4b06-9789-c3d87b0eedcb",
                        "18eb6562-0d5c-4196-a205-2b80ecb68738",
                        "a527175d-5b3f-44b6-b003-f73874e0ad94",
                        "b9d3d130-bed8-487e-a856-742f31a21a59",
                        "c1995f07-14eb-42f9-9fc8-9fd0bf6314be",
                        "a9721f12-8b70-4c68-a091-e89d16921707",
                        "e9907955-f1c9-46ae-8fa8-c82064b4ec6e",
                        "a8171d92-cba3-4da0-be2e-58e3125a3686",
                        "4f8e0f41-2b93-4e84-b369-00766cd935c4"
                    ],
                    "evaluator_name": "ROUGE",
                    "avid_problem_code": "P0200",
                    "avid_problem_code_description": "Ability for the AI to perform as intended"
                }
            },
            {
                "description": "The least accurate sentence identified by the Answer Accuracy evaluator is: \"The document emphasizes that back-testing \"is not a straightforward, mechanical process that always produces unambiguous results\" and that \"the purpose is to test the model, not individual forecast values\" (page 15).\". Additional details - prompt: \"How should banks approach back-testing?\", LLM: claude-3-7-sonnet-20250219.",
                "description_html": "The least accurate sentence identified by the Answer Accuracy evaluator is: \n<b>\n  <i>\n    \"The document emphasizes that back-testing \"is not a straightforward, mechanical process that always produces unambiguous results\" and that \"the purpose is to test the model, not individual forecast values\" (page 15).\"\n  </i>\n</b>\n.\nAdditional details - prompt: \n<b>\n  <i>\n    \"How should banks approach back-testing?\"\n  </i>\n</b>\n, LLM: \n<code>\n  claude-3-7-sonnet-20250219\n</code>\n.",
                "actions_description": "",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
                "explainer_name": "Answer accuracy (semantic similarity)",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "severity": "MEDIUM",
                "problem_type": "problem",
                "problem_attrs": {
                    "dataset_row_keys": [
                        [
                            "77c521ef-9607-4d9a-a2b4-5999b011436f",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ]
                    ],
                    "test_case_keys": [
                        "77c521ef-9607-4d9a-a2b4-5999b011436f"
                    ],
                    "evaluator_name": "Answer accuracy (semantic similarity)",
                    "avid_problem_code": "P0100",
                    "avid_problem_code_description": "Problems arising due to faults in the data pipeline"
                }
            },
            {
                "description": "The least accurate sentence identified by the Answer Accuracy evaluator is: \"Specifically:\n\n1.\". Additional details - prompt: \"How often should model validation be performed?\", LLM: claude-3-7-sonnet-20250219.",
                "description_html": "The least accurate sentence identified by the Answer Accuracy evaluator is: \n<b>\n  <i>\n    \"Specifically:\n\n1.\"\n  </i>\n</b>\n.\nAdditional details - prompt: \n<b>\n  <i>\n    \"How often should model validation be performed?\"\n  </i>\n</b>\n, LLM: \n<code>\n  claude-3-7-sonnet-20250219\n</code>\n.",
                "actions_description": "",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
                "explainer_name": "Answer accuracy (semantic similarity)",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "severity": "MEDIUM",
                "problem_type": "problem",
                "problem_attrs": {
                    "dataset_row_keys": [
                        [
                            "c6d2ee62-5c24-4fea-9737-1d18d2d49034",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ]
                    ],
                    "test_case_keys": [
                        "c6d2ee62-5c24-4fea-9737-1d18d2d49034"
                    ],
                    "evaluator_name": "Answer accuracy (semantic similarity)",
                    "avid_problem_code": "P0100",
                    "avid_problem_code_description": "Problems arising due to faults in the data pipeline"
                }
            },
            {
                "description": "The least accurate sentence identified by the Answer Accuracy evaluator is: \"This may involve sensitivity analysis and benchmarking, especially when full access to coding and implementation details isn't available.\". Additional details - prompt: \"How should banks choose the models they use?\", LLM: claude-3-7-sonnet-20250219.",
                "description_html": "The least accurate sentence identified by the Answer Accuracy evaluator is: \n<b>\n  <i>\n    \"This may involve sensitivity analysis and benchmarking, especially when full access to coding and implementation details isn't available.\"\n  </i>\n</b>\n.\nAdditional details - prompt: \n<b>\n  <i>\n    \"How should banks choose the models they use?\"\n  </i>\n</b>\n, LLM: \n<code>\n  claude-3-7-sonnet-20250219\n</code>\n.",
                "actions_description": "",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
                "explainer_name": "Answer accuracy (semantic similarity)",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "severity": "MEDIUM",
                "problem_type": "problem",
                "problem_attrs": {
                    "dataset_row_keys": [
                        [
                            "42b5d4d6-c23e-453d-ba4e-9f6530d655d9",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ]
                    ],
                    "test_case_keys": [
                        "42b5d4d6-c23e-453d-ba4e-9f6530d655d9"
                    ],
                    "evaluator_name": "Answer accuracy (semantic similarity)",
                    "avid_problem_code": "P0100",
                    "avid_problem_code_description": "Problems arising due to faults in the data pipeline"
                }
            },
            {
                "description": "The least accurate sentence identified by the Answer Accuracy evaluator is: \"List of any exceptions to policy\n10.\". Additional details - prompt: \"What should be included in a bank's inventory of models in use?\", LLM: claude-3-7-sonnet-20250219.",
                "description_html": "The least accurate sentence identified by the Answer Accuracy evaluator is: \n<b>\n  <i>\n    \"List of any exceptions to policy\n10.\"\n  </i>\n</b>\n.\nAdditional details - prompt: \n<b>\n  <i>\n    \"What should be included in a bank's inventory of models in use?\"\n  </i>\n</b>\n, LLM: \n<code>\n  claude-3-7-sonnet-20250219\n</code>\n.",
                "actions_description": "",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
                "explainer_name": "Answer accuracy (semantic similarity)",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "severity": "MEDIUM",
                "problem_type": "problem",
                "problem_attrs": {
                    "dataset_row_keys": [
                        [
                            "0ba1a236-2ee4-43f1-80ff-8ffefc320524",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ]
                    ],
                    "test_case_keys": [
                        "0ba1a236-2ee4-43f1-80ff-8ffefc320524"
                    ],
                    "evaluator_name": "Answer accuracy (semantic similarity)",
                    "avid_problem_code": "P0100",
                    "avid_problem_code_description": "Problems arising due to faults in the data pipeline"
                }
            },
            {
                "description": "The least relevant context chunk identified by the Context relevancy (soft recall and precision) evaluator is: \"SR Letter 11-7\nAttachment\nPage 11\nKey Elements of Comprehensive\nValidation\nAn effective validation framework should include three core elements:\n\u2022\nEvaluation of conceptual soundness, including developmental evidence\n\u2022\nOngoing monitoring, including process verification and benchmarking\n\u2022\nOutcomes analysis, including back-testing\n1. Evaluation of Conceptual Soundness\nThis element involves assessing the quality of the model design and construction. It\nentails review of documentation and empirical evidence supporting the methods used and\nvariables selected for the model. Documentation and testing should convey an\nunderstanding of model limitations and assumptions. Validation should ensure that\njudgment exercised in model design and construction is well informed, carefully\nconsidered, and consistent with published research and with sound industry practice.\nDevelopmental evidence should be reviewed before a model goes into use and also as\npart of the ongoing validation process, in particular whenever there is a material change\nin the model.\n\". Additional details - prompt: \"What is informed conservatism?\", LLM: claude-3-7-sonnet-20250219.",
                "description_html": "The least relevant context chunk identified by the Context relevancy (soft recall and precision) evaluator is: \n<b>\n  <i>\n    \"SR Letter 11-7\nAttachment\nPage 11\nKey Elements of Comprehensive\nValidation\nAn effective validation framework should include three core elements:\n\u2022\nEvaluation of conceptual soundness, including developmental evidence\n\u2022\nOngoing monitoring, including process verification and benchmarking\n\u2022\nOutcomes analysis, including back-testing\n1. Evaluation of Conceptual Soundness\nThis element involves assessing the quality of the model design and construction. It\nentails review of documentation and empirical evidence supporting the methods used and\nvariables selected for the model. Documentation and testing should convey an\nunderstanding of model limitations and assumptions. Validation should ensure that\njudgment exercised in model design and construction is well informed, carefully\nconsidered, and consistent with published research and with sound industry practice.\nDevelopmental evidence should be reviewed before a model goes into use and also as\npart of the ongoing validation process, in particular whenever there is a material change\nin the model.\n\"\n  </i>\n</b>\n.\nAdditional details - prompt: \n<b>\n  <i>\n    \"What is informed conservatism?\"\n  </i>\n</b>\n, LLM: \n<code>\n  claude-3-7-sonnet-20250219\n</code>\n.",
                "actions_description": "",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.rag_chunk_relevancy_evaluator.ContextChunkRelevancyEvaluator",
                "explainer_name": "Context relevancy (soft recall and precision)",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "severity": "MEDIUM",
                "problem_type": "problem",
                "problem_attrs": {
                    "dataset_row_keys": [
                        [
                            "85804363-a32f-452b-b58e-9b43578e25bd",
                            "7aaf642f-a2d8-4ceb-a873-05731d1d3b42"
                        ]
                    ],
                    "test_case_keys": [
                        "85804363-a32f-452b-b58e-9b43578e25bd"
                    ],
                    "evaluator_name": "Context relevancy (soft recall and precision)"
                }
            }
        ],
        "insights": [
            {
                "description": "The 'How should banks approach back-testing?' prompt was evaluated as the most difficult prompt to be correctly answered according to Answer accuracy (semantic similarity) evaluator. Focus on three key areas: training data quality, model fine-tuning, and output validation. First, ensure training data includes diverse, high-quality examples that cover the expected answer patterns. Second, fine-tune the model on task-specific data to improve alignment with expected outputs. Third, implement output validation mechanisms to verify generated answers against expected criteria.",
                "description_html": "Prompt \n<b>\n  <i>\n    'How should banks approach back-testing?'\n  </i>\n</b>\n&nbsp; was evaluated as \n<b class=\"w3-black\">\n  &nbsp;the most difficult prompt&nbsp;\n</b>\n&nbsp; to be correctly answered by evaluated \n&nbsp; models according to\n<code>\n  Answer accuracy (semantic similarity)\n</code>\n evaluator. \nFocus on three key areas: training data quality, model fine-tuning, and output validation. First, ensure training data includes diverse, high-quality examples that cover the expected answer patterns. Second, fine-tune the model on task-specific data to improve alignment with expected outputs. Third, implement output validation mechanisms to verify generated answers against expected criteria.",
                "actions_description": "Refer to the explanation for the detailed description of questions and answers by evaluated models in order to identify weaknesses and strengths.",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.answer_accuracy_evaluator.AnswerAccuracyEvaluator",
                "explainer_name": "Answer accuracy (semantic similarity)",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "insight_type": "accuracy",
                "insight_attrs": {}
            },
            {
                "description": "The 'How can model risk be managed?' prompt was evaluated as the most difficult prompt to be correctly answered according to ROUGE evaluator. ",
                "description_html": "Prompt \n<b>\n  <i>\n    'How can model risk be managed?'\n  </i>\n</b>\n&nbsp; was evaluated as \n<b class=\"w3-black\">\n  &nbsp;the most difficult prompt&nbsp;\n</b>\n&nbsp; to be correctly answered by evaluated \n&nbsp; models according to\n<code>\n  ROUGE\n</code>\n evaluator. \n",
                "actions_description": "Refer to the explanation for the detailed description of questions and answers by evaluated models in order to identify weaknesses and strengths.",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.rouge_evaluator.RougeEvaluator",
                "explainer_name": "ROUGE",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "insight_type": "summarization",
                "insight_attrs": {}
            },
            {
                "description": "Prompt 'How can model risk be managed?' is the most difficult prompt to be correctly answered according to Text matching evaluator.",
                "description_html": "Prompt \n<b>\n  <i>\n    'How can model risk be managed?'\n  </i>\n</b>\n&nbsp; was evaluated as \n<b class=\"w3-black\">\n  &nbsp;the most \n  difficult\n  &nbsp;\n</b>\n&nbsp; prompt to be correctly answered according to \n<code>\n  Text matching\n</code>\n evaluator.",
                "actions_description": "A detailed description of the failures, questions and answers to identify the weaknesses and strengths of the model and their root causes can be found in the explanation. Check the prompt, expected answer and condition - are they correct? Check models answers in failed cases and look for a common denominator and/or root cause of these failures.",
                "actions_codes": [],
                "explainer_id": "h2o_sonar.evaluators.rag_tokens_presence_evaluator.RagStrStrEvaluator",
                "explainer_name": "Text matching",
                "explanation_type": "global-html-fragment",
                "explanation_name": "GlobalHtmlFragmentExplanation",
                "explanation_mime": "text/html",
                "resources": [],
                "insight_type": "weak-point",
                "insight_attrs": {}
            }
        ],
        "overall_result": "high_severity_problems",
        "results_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4",
        "interpretation_location": "/tmp/pytest-of-dvorka/pytest-0/test_evaluate_and_compare__hom4/h2o-sonar/mli_experiment_25269fb8-8cb5-4879-be2f-3570da92adc1"
    }
}