@@ -1,6 +1,7 @@
 import copy
 import json
 import logging
+import threading
 import time
 from functools import singledispatch
 from typing import List, Optional, Union
@@ -269,7 +270,8 @@ async def _handle_request(span, kwargs, instance):
                 MessageEvent(
                     content=message.get("content"),
                     role=message.get("role"),
-                    tool_calls=_parse_tool_calls(message.get("tool_calls", None)),
+                    tool_calls=_parse_tool_calls(
+                        message.get("tool_calls", None)),
                 )
             )
     else:
@@ -292,6 +294,7 @@ def _handle_response(
     choice_counter=None,
     duration_histogram=None,
     duration=None,
+    is_streaming: bool = False,
 ):
     if is_openai_v1():
         response_dict = model_as_dict(response)
@@ -306,6 +309,7 @@ def _handle_response(
             duration_histogram,
             response_dict,
             duration,
+            is_streaming,
         )

     # span attributes
@@ -323,13 +327,19 @@ def _handle_response(


 def _set_chat_metrics(
-    instance, token_counter, choice_counter, duration_histogram, response_dict, duration
+    instance,
+    token_counter,
+    choice_counter,
+    duration_histogram,
+    response_dict,
+    duration,
+    is_streaming: bool = False,
 ):
     shared_attributes = metric_shared_attributes(
         response_model=response_dict.get("model") or None,
         operation="chat",
         server_address=_get_openai_base_url(instance),
-        is_streaming=False,
+        is_streaming=is_streaming,
     )

     # token metrics
@@ -420,7 +430,8 @@ async def _set_prompts(span, messages):
                 content = json.dumps(content)
             _set_span_attribute(span, f"{prefix}.content", content)
             if msg.get("tool_call_id"):
-                _set_span_attribute(span, f"{prefix}.tool_call_id", msg.get("tool_call_id"))
+                _set_span_attribute(
+                    span, f"{prefix}.tool_call_id", msg.get("tool_call_id"))
             tool_calls = msg.get("tool_calls")
             if tool_calls:
                 for i, tool_call in enumerate(tool_calls):
@@ -476,9 +487,11 @@ def _set_completions(span, choices):
         _set_span_attribute(span, f"{prefix}.role", message.get("role"))

         if message.get("refusal"):
-            _set_span_attribute(span, f"{prefix}.refusal", message.get("refusal"))
+            _set_span_attribute(
+                span, f"{prefix}.refusal", message.get("refusal"))
         else:
-            _set_span_attribute(span, f"{prefix}.content", message.get("content"))
+            _set_span_attribute(
+                span, f"{prefix}.content", message.get("content"))

         function_call = message.get("function_call")
         if function_call:
@@ -533,7 +546,8 @@ def _set_streaming_token_metrics(
     # If API response doesn't have usage, fallback to tiktoken calculation
     if prompt_usage == -1 or completion_usage == -1:
         model_name = (
-            complete_response.get("model") or request_kwargs.get("model") or "gpt-4"
+            complete_response.get("model") or request_kwargs.get(
+                "model") or "gpt-4"
         )

         # Calculate prompt tokens if not available from API
@@ -543,7 +557,8 @@ def _set_streaming_token_metrics(
             if msg.get("content"):
                 prompt_content += msg.get("content")
         if model_name and should_record_stream_token_usage():
-            prompt_usage = get_token_count_from_string(prompt_content, model_name)
+            prompt_usage = get_token_count_from_string(
+                prompt_content, model_name)

     # Calculate completion tokens if not available from API
     if completion_usage == -1 and complete_response.get("choices"):
@@ -566,7 +581,8 @@ def _set_streaming_token_metrics(
             **shared_attributes,
             SpanAttributes.LLM_TOKEN_TYPE: "input",
         }
-        token_counter.record(prompt_usage, attributes=attributes_with_token_type)
+        token_counter.record(
+            prompt_usage, attributes=attributes_with_token_type)

     if isinstance(completion_usage, int) and completion_usage >= 0:
         attributes_with_token_type = {
@@ -619,11 +635,34 @@ def __init__(
         self._time_of_first_token = self._start_time
         self._complete_response = {"choices": [], "model": ""}

+        # Cleanup state tracking to prevent duplicate operations
+        self._cleanup_completed = False
+        self._cleanup_lock = threading.Lock()
+
+    def __del__(self):
+        """Cleanup when object is garbage collected"""
+        if hasattr(self, '_cleanup_completed') and not self._cleanup_completed:
+            self._ensure_cleanup()
+
     def __enter__(self):
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.__wrapped__.__exit__(exc_type, exc_val, exc_tb)
+        cleanup_exception = None
+        try:
+            self._ensure_cleanup()
+        except Exception as e:
+            cleanup_exception = e
+            # Don't re-raise to avoid masking original exception
+
+        result = self.__wrapped__.__exit__(exc_type, exc_val, exc_tb)
+
+        if cleanup_exception:
+            # Log cleanup exception but don't affect context manager behavior
+            logger.debug(
+                "Error during ChatStream cleanup in __exit__: %s", cleanup_exception)
+
+        return result

     async def __aenter__(self):
         return self
@@ -643,6 +682,11 @@ def __next__(self):
         except Exception as e:
             if isinstance(e, StopIteration):
                 self._process_complete_response()
+            else:
+                # Handle cleanup for other exceptions during stream iteration
+                self._ensure_cleanup()
+                if self._span and self._span.is_recording():
+                    self._span.set_status(Status(StatusCode.ERROR, str(e)))
             raise
         else:
             self._process_item(chunk)
@@ -654,13 +698,19 @@ async def __anext__(self):
         except Exception as e:
             if isinstance(e, StopAsyncIteration):
                 self._process_complete_response()
+            else:
+                # Handle cleanup for other exceptions during stream iteration
+                self._ensure_cleanup()
+                if self._span and self._span.is_recording():
+                    self._span.set_status(Status(StatusCode.ERROR, str(e)))
             raise
         else:
             self._process_item(chunk)
             return chunk

     def _process_item(self, item):
-        self._span.add_event(name=f"{SpanAttributes.LLM_CONTENT_COMPLETION_CHUNK}")
+        self._span.add_event(
+            name=f"{SpanAttributes.LLM_CONTENT_COMPLETION_CHUNK}")

         if self._first_token and self._streaming_time_to_first_token:
             self._time_of_first_token = time.time()
@@ -721,10 +771,82 @@ def _process_complete_response(self):
                 emit_event(_parse_choice_event(choice))
         else:
             if should_send_prompts():
-                _set_completions(self._span, self._complete_response.get("choices"))
+                _set_completions(
+                    self._span, self._complete_response.get("choices"))

         self._span.set_status(Status(StatusCode.OK))
         self._span.end()
+        self._cleanup_completed = True
+
+    @dont_throw
+    def _ensure_cleanup(self):
+        """Thread-safe cleanup method that handles different cleanup scenarios"""
+        with self._cleanup_lock:
+            if self._cleanup_completed:
+                logger.debug("ChatStream cleanup already completed, skipping")
+                return
+
+            try:
+                logger.debug("Starting ChatStream cleanup")
+
+                # Calculate partial metrics based on available data
+                self._record_partial_metrics()
+
+                # Set span status and close it
+                if self._span and self._span.is_recording():
+                    self._span.set_status(Status(StatusCode.OK))
+                    self._span.end()
+                    logger.debug("ChatStream span closed successfully")
+
+                self._cleanup_completed = True
+                logger.debug("ChatStream cleanup completed successfully")
+
+            except Exception as e:
+                # Log cleanup errors but don't propagate to avoid masking original issues
+                logger.debug("Error during ChatStream cleanup: %s", str(e))
+
+                # Still try to close the span even if metrics recording failed
+                try:
+                    if self._span and self._span.is_recording():
+                        self._span.set_status(
+                            Status(StatusCode.ERROR, "Cleanup failed"))
+                        self._span.end()
+                    self._cleanup_completed = True
+                except Exception:
+                    # Final fallback - just mark as completed to prevent infinite loops
+                    self._cleanup_completed = True
+
+    @dont_throw
+    def _record_partial_metrics(self):
+        """Record metrics based on available partial data"""
+        # Always record duration if we have start time
+        if self._start_time and isinstance(self._start_time, (float, int)) and self._duration_histogram:
+            duration = time.time() - self._start_time
+            self._duration_histogram.record(
+                duration, attributes=self._shared_attributes()
+            )
+
+        # Record basic span attributes even without complete response
+        if self._span and self._span.is_recording():
+            _set_response_attributes(self._span, self._complete_response)
+
+        # Record partial token metrics if we have any data
+        if self._complete_response.get("choices") or self._request_kwargs:
+            _set_streaming_token_metrics(
+                self._request_kwargs,
+                self._complete_response,
+                self._span,
+                self._token_counter,
+                self._shared_attributes(),
+            )
+
+        # Record choice metrics if we have any choices processed
+        if self._choice_counter and self._complete_response.get("choices"):
+            _set_choice_counter_metrics(
+                self._choice_counter,
+                self._complete_response.get("choices"),
+                self._shared_attributes(),
+            )


 # Backward compatibility with OpenAI v0
@@ -755,7 +877,8 @@ def _build_from_streaming_response(

         if first_token and streaming_time_to_first_token:
             time_of_first_token = time.time()
-            streaming_time_to_first_token.record(time_of_first_token - start_time)
+            streaming_time_to_first_token.record(
+                time_of_first_token - start_time)
             first_token = False

         _accumulate_stream_items(item, complete_response)
@@ -825,7 +948,8 @@ async def _abuild_from_streaming_response(

         if first_token and streaming_time_to_first_token:
             time_of_first_token = time.time()
-            streaming_time_to_first_token.record(time_of_first_token - start_time)
+            streaming_time_to_first_token.record(
+                time_of_first_token - start_time)
             first_token = False

         _accumulate_stream_items(item, complete_response)
@@ -943,7 +1067,8 @@ def _(choice: dict) -> ChoiceEvent:

     content = choice.get("message").get("content", "") if has_message else None
     role = choice.get("message").get("role") if has_message else "unknown"
-    finish_reason = choice.get("finish_reason") if has_finish_reason else "unknown"
+    finish_reason = choice.get(
+        "finish_reason") if has_finish_reason else "unknown"

     if has_tool_calls and has_function_call:
         tool_calls = message.get("tool_calls") + [message.get("function_call")]
@@ -982,7 +1107,8 @@ def _accumulate_stream_items(item, complete_response):

     # prompt filter results
     if item.get("prompt_filter_results"):
-        complete_response["prompt_filter_results"] = item.get("prompt_filter_results")
+        complete_response["prompt_filter_results"] = item.get(
+            "prompt_filter_results")

     for choice in item.get("choices"):
         index = choice.get("index")
@@ -1029,4 +1155,5 @@ def _accumulate_stream_items(item, complete_response):
                 if tool_call_function and tool_call_function.get("name"):
                     span_function["name"] = tool_call_function.get("name")
                 if tool_call_function and tool_call_function.get("arguments"):
-                    span_function["arguments"] += tool_call_function.get("arguments")
+                    span_function["arguments"] += tool_call_function.get(
+                        "arguments")