[Performance] Solve high memory usage issue during model compilation using OpenVINO backend on Keras 3 #31482
Conversation
// the order is important
const char* enable_einsum = std::getenv("OV_ENABLE_EINSUM_DECOMPOSITION");
if (enable_einsum) {
    REGISTER_PASS(manager, EinsumDecomposition)
}
I don't think this is a good way to fix this. Doing this in MOC means we will have decomposed einsum in the IR.
As I understand it, this is really only needed for einsums that have constant inputs, so they can be constant-folded before reaching the plugin. Can we do it differently? Maybe modify this transformation to work only on constant inputs for the offline step? @CuriousPanCake
@mvafin
I updated it to check if at least one of the inputs is a constant, and it worked too.
With this change, memory usage went from:
================================================================================
FIXED MEMORY TEST: KERAS GPT2 + OPENVINO
================================================================================
[STAGE] 0_INITIAL: 775.24 MB (swap: 0.00 MB) - Initial state after imports
>>> Loading GPT2 model from preset...
[STAGE] 1_MODEL_LOADED: 2314.67 MB (swap: 0.00 MB) - gpt2_medium_en model loaded (10.0s)
[STAGE] 2_BEFORE_INFERENCE: 2314.67 MB (swap: 0.00 MB) - Before first inference
>>> Running first inference (compilation + execution)...
⏳ Converting Keras -> OPENVINO and compiling...
[STAGE] 3_FIRST_INFERENCE: 4512.82 MB (swap: 0.00 MB) - First inference completed via generate (7.7s)
>>> Second inference (no compilation)...
[STAGE] 4_SECOND_INFERENCE: 4510.38 MB (swap: 0.00 MB) - Second inference (2.0s)
[STAGE] 5_FINAL: 4510.38 MB (swap: 0.00 MB) - Final state
================================================================================
PERFORMANCE RESULTS
================================================================================
✅ Generated text: 'Hello everyone,
We've been busy'
✅ Second generation: 'Testimony before the House Judiciary Committee on April'
Backend: openvino
First inference latency: 7.69s
Second inference latency: 2.045s
Throughput: 0.65 tokens/sec
Speedup: 3.8x
📊 DETAILED MEMORY ANALYSIS:
+---------------------+------------+-------------+--------------+---------------+
| STAGE | RAM (MB) | SWAP (MB) | RAM CHANGE | SWAP CHANGE |
+=====================+============+=============+==============+===============+
| Initial | 775.2 | 0 | - | - |
+---------------------+------------+-------------+--------------+---------------+
| After model load | 2314.7 | 0 | +1539.4 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Before inference | 2314.7 | 0 | +0.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| After 1st inference | 4512.8 | 0 | +2198.1 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| After 2nd inference | 4510.4 | 0 | -2.4 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Final | 4510.4 | 0 | +0.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Peak recorded | 4522.9 | 0 | +3747.7 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
🔍 MAIN MEMORY CONSUMERS:
📚 Model loading: +1539.4 MB RAM +0.0 MB swap (41.2% of total)
⚡ Compilation/inference: +2198.1 MB RAM +0.0 MB swap (58.9% of total)
📈 SUMMARY:
💾 Total RAM growth: +3735.1 MB
💿 Total swap change: +0.0 MB
📊 Peak RAM consumption: +3747.7 MB above initial
🔥 Highest RAM recorded: 4522.9 MB
💿 Peak swap consumption: +0.0 MB above initial
🔥 Highest swap recorded: 0.0 MB
🎯 MEMORY HEALTH CHECK:
❌ CRITICAL: RAM usage 3748 MB is very high (target <1GB)
✅ GOOD: Low peak swap usage 0 MB
🚨 ALERT: Combined memory impact 4523 MB is very high
🎯 Test completed: {'success': True, 'model_loading_mb': 1539.4296875, 'compilation_mb': 2198.1484375, 'total_mb': 3735.13671875, 'peak_mb': 3747.6640625, 'peak_swap_mb': 0.0}
to:
[STAGE] 0_INITIAL: 781.90 MB (swap: 0.00 MB) - Initial state after imports
>>> Loading GPT2 model from preset...
[STAGE] 1_MODEL_LOADED: 2321.91 MB (swap: 0.00 MB) - gpt2_medium_en model loaded (13.4s)
[STAGE] 2_BEFORE_INFERENCE: 2321.91 MB (swap: 0.00 MB) - Before first inference
>>> Running first inference (compilation + execution)...
⏳ Converting Keras -> OPENVINO and compiling...
[STAGE] 3_FIRST_INFERENCE: 3548.79 MB (swap: 0.00 MB) - First inference completed via generate (7.6s)
>>> Second inference (no compilation)...
[STAGE] 4_SECOND_INFERENCE: 3546.42 MB (swap: 0.00 MB) - Second inference (2.7s)
[STAGE] 5_FINAL: 3546.42 MB (swap: 0.00 MB) - Final state
================================================================================
PERFORMANCE RESULTS
================================================================================
✅ Generated text: 'Hello! I'm a student studying computer programming'
✅ Second generation: 'Testimonials
I was a new'
Backend: openvino
First inference latency: 7.62s
Second inference latency: 2.673s
Throughput: 0.92 tokens/sec
Speedup: 2.9x
📊 DETAILED MEMORY ANALYSIS:
+---------------------+------------+-------------+--------------+---------------+
| STAGE | RAM (MB) | SWAP (MB) | RAM CHANGE | SWAP CHANGE |
+=====================+============+=============+==============+===============+
| Initial | 781.9 | 0 | - | - |
+---------------------+------------+-------------+--------------+---------------+
| After model load | 2321.9 | 0 | +1540.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Before inference | 2321.9 | 0 | +0.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| After 1st inference | 3548.8 | 0 | +1226.9 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| After 2nd inference | 3546.4 | 0 | -2.4 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Final | 3546.4 | 0 | +0.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Peak recorded | 3567.8 | 0 | +2785.9 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
🔍 MAIN MEMORY CONSUMERS:
📚 Model loading: +1540.0 MB RAM +0.0 MB swap (55.7% of total)
⚡ Compilation/inference: +1226.9 MB RAM +0.0 MB swap (44.4% of total)
📈 SUMMARY:
💾 Total RAM growth: +2764.5 MB
💿 Total swap change: +0.0 MB
📊 Peak RAM consumption: +2785.9 MB above initial
🔥 Highest RAM recorded: 3567.8 MB
💿 Peak swap consumption: +0.0 MB above initial
🔥 Highest swap recorded: 0.0 MB
🎯 MEMORY HEALTH CHECK:
❌ CRITICAL: RAM usage 2786 MB is very high (target <1GB)
✅ GOOD: Low peak swap usage 0 MB
🎯 Test completed: {'success': True, 'model_loading_mb': 1540.0078125, 'compilation_mb': 1226.88671875, 'total_mb': 2764.5234375, 'peak_mb': 2785.86328125, 'peak_swap_mb': 0.0}
@@ -163,7 +164,8 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr<ov::Model>
    REGISTER_PASS(manager, PushConstantToSubgraph)
    REGISTER_PASS(manager, ConstantFolding)
    REGISTER_PASS(manager, Validate)

    // the order is important
Please add a better comment explaining before which transformation this should be called.
Done!
if (m_check_const) {
    // Only decompose einsums that have at least one constant input,
    // so the decomposed subgraph can be constant-folded afterwards.
    bool has_const = false;
    for (auto& input : einsum_node->input_values()) {
        auto node_ptr = input.get_node_shared_ptr();
        auto constant_ptr = ov::as_type_ptr<ov::op::v0::Constant>(node_ptr);
        if (constant_ptr) {
            has_const = true;
            break;
        }
    }
    if (!has_const)
        return false;
}
Could you provide more details about the einsum operation you want to optimize? Maybe link to the model code or a picture of the subgraph.
This optimization targets specific Einsum operations in transformer models like GPT-2, where at least one input is a constant tensor. After ConstantFolding, weight matrices become constants, enabling more efficient decomposition patterns.
Specific Einsum Operations Being Optimized:
1. Query-Key Attention Scores Computation:
   - Location: https://github.com/keras-team/keras/blob/master/keras/src/layers/attention/multi_head_attention.py#L493
   - Pattern: `einsum("aecd,abcd->acbe", key, query)`
   - Code: `attention_scores = ops.einsum(self._dot_product_equation, key, query)`
2. Attention-Value Combination:
   - Location: https://github.com/keras-team/keras/blob/master/keras/src/layers/attention/multi_head_attention.py#L509-L511
   - Pattern: `einsum("acbe,aecd->abcd", attention_scores, value)`
   - Code: `attention_output = ops.einsum(self._combine_equation, final_attn_scores, value)`
3. Weight Matrix Projections (Q/K/V Transformations):
   - Location: https://github.com/keras-team/keras/blob/master/keras/src/layers/core/einsum_dense.py#L214
   - Pattern: `einsum("abc,cd->abd", input, weight_matrix)`
   - Code: `x = ops.einsum(self.equation, inputs, self.kernel)`
Optimization Application:
Note: the optimization is only applied when at least one einsum input is constant. In the examples above:
- ✅ Weight Matrix Projections (example 3): `weight_matrix` becomes constant after ConstantFolding → optimization applied
- ❌ Attention Scores (examples 1 & 2): both `key` and `query` are variable tensors → no optimization
For more details and examples, see: https://gist.github.com/Mohamed-Ashraf273/59eddcd120918cb0761ffa5020800d5d
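To make the constant-input case concrete, below is a minimal, self-contained sketch (illustrative code, not part of this PR) that builds the example-3 pattern with a constant weight and runs `EinsumDecomposition` followed by `ConstantFolding`. The header path for the decomposition pass is assumed to be the one used in the current transformations library; verify it against your OpenVINO checkout:

```cpp
#include <openvino/core/model.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/einsum.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/pass/constant_folding.hpp>
#include <openvino/pass/manager.hpp>
#include <transformations/op_conversions/einsum_decomposition.hpp>

int main() {
    // einsum("abc,cd->abd", input, weight): the weight is a Constant,
    // mirroring the EinsumDense projection pattern (example 3 above).
    auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{2, 3, 4});
    auto weight = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4, 5},
                                               std::vector<float>(4 * 5, 0.1f));
    auto einsum = std::make_shared<ov::op::v7::Einsum>(ov::OutputVector{input, weight},
                                                       "abc,cd->abd");
    auto model = std::make_shared<ov::Model>(ov::OutputVector{einsum},
                                             ov::ParameterVector{input});

    // Decompose the einsum into primitive ops, then fold whatever
    // depends only on the constant weight.
    ov::pass::Manager manager;
    manager.register_pass<ov::pass::EinsumDecomposition>();
    manager.register_pass<ov::pass::ConstantFolding>();
    manager.run_passes(model);
    return 0;
}
```

With a variable-only einsum (examples 1 and 2), the same pipeline would decompose the op but leave nothing for `ConstantFolding` to fold, which is why the PR gates the decomposition on having at least one constant input.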
Solves issue #31390; see also #30934.
Adding `EinsumDecomposition` to MOC transformations helped reduce memory usage during model compilation.
To reproduce, run this script with the memory profiling from #31516:
- Use the Keras source: https://github.com/keras-team/keras.git
- Also use this PR from keras_hub: keras-team/keras-hub#2350
- Then run the following script.
- Then enable `os.environ["OV_ENABLE_MEMORY_PROFILING"] = "1"` by uncommenting it.

The memory profiles without and with the fix are shown earlier in this conversation (the "from"/"to" outputs). The fix itself adds `EinsumDecomposition` to the MOC pipeline, as sketched below.
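A reconstruction of the addition, based on the MOC diff shown earlier in this conversation (the exact placement and surrounding passes in the final PR may differ):

```cpp
// In ov::pass::MOCTransformations::run_on_model:
REGISTER_PASS(manager, PushConstantToSubgraph)
REGISTER_PASS(manager, ConstantFolding)
REGISTER_PASS(manager, Validate)

// the order is important: decompose einsums that have constant inputs
// here, so the subsequent constant-folding passes can fold them before
// the model reaches the plugin.
REGISTER_PASS(manager, EinsumDecomposition)
```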
Note: the position of this pass in the pipeline is important.
I am still exploring what else can help reduce memory usage further and would appreciate any suggestions or recommendations.