    print_writeout,
    run_task_tests,
)
-from lm_eval.logging_utils import add_env_info, get_git_commit_hash
+from lm_eval.loggers import add_env_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
from lm_eval import utils
@@ -509,9 +509,14 @@ def evaluate(
        # aggregate results; run bootstrap CIs
        for task_output in eval_tasks:
            task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
-        results, samples, configs, versions, num_fewshot = consolidate_results(
-            eval_tasks
-        )
+        (
+            results,
+            samples,
+            configs,
+            versions,
+            num_fewshot,
+            higher_is_better,
+        ) = consolidate_results(eval_tasks)

        ### Calculate group metrics ###
        if bool(results):
@@ -522,6 +527,23 @@ def evaluate(
                    # or `task_name: []`.
                    # we only want to operate on groups here.
                    continue
+
+                # collect all higher_is_better values for metrics
+                # in the group's subtasks.
+                # TODO: clean this up; unify with the below metric_list loop?
+                _higher_is_better = {}
+                for task in task_list:
+                    for m, h in higher_is_better[task].items():
+                        if m not in _higher_is_better.keys():
+                            _higher_is_better[m] = h
+                        if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h:
+                            eval_logger.warning(
+                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
+                            )
+                            _higher_is_better[m] = None
+                higher_is_better[group] = _higher_is_better
+
+                # collect all metric keys used by a subtask in the group.
                metric_list = list(
                    {
                        key
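A minimal standalone sketch of the merge behavior introduced in the block above: per-metric `higher_is_better` flags from a group's subtasks are combined into one group-level entry, falling back to `None` when subtasks disagree. The task names and metric flags here are made up for illustration and are not the harness's real data structures.

```python
# Illustration only: hypothetical per-subtask higher_is_better maps.
higher_is_better = {
    "subtask_a": {"acc": True, "perplexity": False},
    "subtask_b": {"acc": True, "perplexity": True},  # disagrees on perplexity
}

merged = {}
for task, flags in higher_is_better.items():
    for metric, flag in flags.items():
        if metric not in merged:
            merged[metric] = flag
        elif merged[metric] is not None and merged[metric] != flag:
            # inconsistent across subtasks -> default to None, as the diff does
            merged[metric] = None

print(merged)  # {'acc': True, 'perplexity': None}
```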
@@ -534,38 +556,20 @@ def evaluate(
                    stderr = "_stderr,".join(metric.split(","))

                    # gather metrics, sizes, and stderrs from subtasks
-                    metrics = [
-                        results[task][metric]
-                        for task in task_list
-                        if metric in results[task]
-                    ]  # TODO: copy?
-                    stderrs = [
-                        results[task][stderr]
-                        for task in task_list
-                        if stderr in results[task]
-                    ]
-                    sizes = [
-                        results[task]["samples"]
-                        for task in task_list
-                        if metric in results[task]
-                    ]
+                    metrics = [results[task][metric] for task in task_list if metric in results[task]]  # TODO: copy?
+                    stderrs = [results[task][stderr] for task in task_list if stderr in results[task]]
+                    sizes = [results[task]["samples"] for task in task_list if metric in results[task]]

                    # compute group's pooled metric and stderr
-                    results[group][metric] = (
-                        lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
-                    )
+                    results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
                    # TODO: calculate grouped metric using aggregation fn
                    if "N/A" in stderrs:
                        results[group][stderr] = "N/A"
                    else:
-                        results[group][stderr] = (
-                            lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
-                        )
+                        results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
                        # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
-                        # To use the old (likely incorrect) variance formula,
-                        # comment out the above and uncomment this line:
-                        # results[group][stderr] = \
-                        #     lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
+                        # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
+                        # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

                    results[group]["samples"] = sum(sizes)

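A toy sketch of what size-weighted pooling of subtask results looks like. `aggregate_subtask_metrics` and `pooled_sample_stderr` are the harness's own helpers; the formulas below are only a plausible stand-in (weighted mean plus a pooled-variance standard error), not their exact implementations.

```python
import math

# Illustration only: toy subtask scores, stderrs, and sample counts.
metrics = [0.62, 0.70]   # per-subtask metric values
stderrs = [0.02, 0.015]  # per-subtask standard errors
sizes = [500, 1500]      # per-subtask sample counts

# size-weighted mean of the subtask metrics
pooled_metric = sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)

# one common way to pool standard errors: combine variances weighted by size
pooled_var = sum((n - 1) * se**2 * n for se, n in zip(stderrs, sizes)) / (
    sum(sizes) - len(sizes)
)
pooled_stderr = math.sqrt(pooled_var / sum(sizes))

print(pooled_metric, pooled_stderr)
```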
@@ -578,19 +582,15 @@ def evaluate(
            if len(left_tasks_list) == 0:
                break

-            _task_hierarchy = {
-                k: v for k, v in task_hierarchy.items() if k in left_tasks_list
-            }
+            _task_hierarchy = {k: v for k, v in task_hierarchy.items() if k in left_tasks_list}
            _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)

            results_agg = {**results_agg, **_results_agg}
            groups_agg = {**groups_agg, **_groups_agg}

        for group_name, task_list in task_hierarchy.items():
            if task_list:
-                num_fewshot[group_name] = num_fewshot[
-                    task_list[0]
-                ]  # TODO: validate this
+                num_fewshot[group_name] = num_fewshot[task_list[0]]  # TODO: validate this

        results_dict = {
            "results": dict(results_agg.items()),
@@ -599,6 +599,17 @@ def evaluate(
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
            "n-shot": dict(sorted(num_fewshot.items())),
+            "higher_is_better": dict(sorted(higher_is_better.items())),
+            "n-samples": {
+                task_output.task_name: {
+                    "original": len(task_output.task.eval_docs),
+                    "effective": min(
+                        limit if limit else len(task_output.task.eval_docs),
+                        len(task_output.task.eval_docs),
+                    ),
+                }
+                for task_output in eval_tasks
+            },
        }
        if log_samples:
            results_dict["samples"] = dict(samples)
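The new `n-samples` entry records, per task, the original number of evaluation documents and the effective count after any limit is applied. A small sketch of that `min(...)` logic; `effective_samples` is a made-up helper name used purely for illustration.

```python
from typing import Optional

# Illustration only: how the effective sample count relates to a limit.
def effective_samples(num_docs: int, limit: Optional[int]) -> int:
    # with no limit, everything is evaluated; otherwise cap at the task size
    return min(limit if limit else num_docs, num_docs)

assert effective_samples(1000, None) == 1000
assert effective_samples(1000, 100) == 100
assert effective_samples(50, 100) == 50  # limit larger than the task
```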
@@ -608,7 +619,6 @@ def evaluate(
    else:
        return None

-
def request_caching_arg_to_dict(cache_requests: str) -> dict:
    request_caching_args = {
        "cache_requests": cache_requests in {"true", "refresh"},