@@ -40,8 +40,9 @@ pip install -v .
 # install requirements
 cd examples/huggingface/pytorch/text-generation/quantization
 pip install -r requirements.txt
-pip install neural-compressor==2.5
-pip install transformers==4.35.2
+pip install neural-compressor==2.6
+pip install transformers==4.38.1
+
 pip install torch==2.3.0+cpu --index-url https://download.pytorch.org/whl/cpu
 pip install intel-extension-for-pytorch==2.3.0
 ```
@@ -221,10 +222,11 @@ python run_generation_sq.py \
     --calib_len 2048 \
     --fallback_add \
     --calib_shuffle False \
+    --calib_iters 512 \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
     --batch_size 1 \
-    --recipes " {'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.8 , 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'mean'}}}"
+    --recipes " {'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.79, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'mean'}}}"
 ```
 
 ### Weight-Only Quantization
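The updated `--recipes` string above asks neural-compressor to auto-tune SmoothQuant's alpha rather than fixing it, searching the range given by `auto_alpha_args` and reducing per-layer results with `shared_criterion` ('mean' here). A minimal sketch of the candidate grid that `alpha_min`/`alpha_max`/`alpha_step` describe; the `alpha_grid` helper is hypothetical, and the actual search lives inside neural-compressor:

```python
# Hypothetical helper showing the alpha candidates implied by auto_alpha_args;
# neural-compressor performs the real per-layer search and reduction.
import numpy as np

def alpha_grid(alpha_min: float, alpha_max: float, alpha_step: float) -> np.ndarray:
    n = int(round((alpha_max - alpha_min) / alpha_step)) + 1
    return np.round(alpha_min + alpha_step * np.arange(n), 10)

print(alpha_grid(0.79, 0.99, 0.01))  # 21 candidates: 0.79, 0.80, ..., 0.99
```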
@@ -276,11 +278,12 @@ python run_generation_sq.py \
     --trust_remote_code \
     --calib_len 1024 \
     --fallback_add \
+    --calib_iters 512 \
     --calib_padding \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
     --batch_size 1 \
-    --recipes " {'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.75, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'max'}}}"
+    --recipes " {'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.75, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'max', 'n_samples': 64}}}"
 ```
 
 ### Weight-Only Quantization
@@ -544,7 +547,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
     --batch_size 1 \
-    --alpha 0.65
+    --alpha 1.0
 ```
 
 ### Weight-Only Quantization
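For runs that pass a fixed `--alpha` instead of `'auto'`, the value is the exponent in SmoothQuant's per-channel scaling rule from the SmoothQuant paper, s_j = max|X_j|^alpha / max|W_j|^(1-alpha): `--alpha 1.0` derives the scales from activation statistics alone, migrating all of the quantization difficulty onto the weights. A runnable sketch with random stand-in tensors (the model and calibration data in the scripts above are real checkpoints, not these toys):

```python
# SmoothQuant scaling rule: activations are divided per channel by s,
# weights multiplied by s, so the layer output y = X @ W is unchanged.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 4))   # activations; columns are input channels
W = rng.normal(size=(4, 4))   # weights for y = X @ W; rows are input channels

def smooth(X, W, alpha, eps=1e-5):
    # Per-channel scales: s_j = max|X_j|^alpha / max|W_j|^(1 - alpha)
    s = (np.maximum(np.abs(X).max(axis=0), eps) ** alpha
         / np.maximum(np.abs(W).max(axis=1), eps) ** (1 - alpha))
    return X / s, W * s[:, None]

Xs, Ws = smooth(X, W, alpha=1.0)  # alpha=1.0: difficulty moved fully to weights
assert np.allclose(X @ W, Xs @ Ws)
```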
@@ -650,8 +653,9 @@ python run_generation_sq.py \
     --trust_remote_code \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
     --batch_size 1 \
-    --alpha 0.75
+    --recipes " {'smooth_quant':True,'smooth_quant_args':{'alpha':'auto','folding':False,'default_alpha':0.7,'auto_alpha_args':{'alpha_min':0.55,'alpha_max':0.8,'alpha_step':0.01,'shared_criterion':'mean','n_samples':64}}}" \
+    --calib_iters 512
 ```
 
 ### Weight-Only Quantization
@@ -702,8 +707,8 @@ python run_generation_sq.py \
     --trust_remote_code \
     --tasks lambada_openai \
     --int8 --sq --accuracy \
-    --batch_size 1 \
-    --alpha 0.9
+    --recipes " {'smooth_quant':True,'smooth_quant_args':{'alpha':'auto','folding':False,'default_alpha':0.85,'auto_alpha_args':{'alpha_min':0.79,'alpha_max':0.88,'alpha_step':0.01,'shared_criterion':'mean'}}}" \
+    --batch_size 1
 ```
 
 ### Weight-Only Quantization
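In every hunk above, the `--recipes` value is a single Python dict literal passed as one shell argument, which is why it is wrapped in double quotes and uses single quotes internally. A quick way to sanity-check such a string before launching a long quantization run; this is a standalone sketch, and how `run_generation_sq.py` itself parses the flag may differ:

```python
# Validate a --recipes string as a Python dict literal before a long run;
# ast.literal_eval accepts literals only, never arbitrary code.
import ast

recipes_arg = (
    " {'smooth_quant':True,'smooth_quant_args':{'alpha':'auto','folding':False,"
    "'default_alpha':0.85,'auto_alpha_args':{'alpha_min':0.79,'alpha_max':0.88,"
    "'alpha_step':0.01,'shared_criterion':'mean'}}}"
)
recipes = ast.literal_eval(recipes_arg)
print(recipes["smooth_quant_args"]["auto_alpha_args"])
# In neural-compressor 2.x, a dict of this shape is what
# PostTrainingQuantConfig(recipes=...) expects.
```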