Skip to content

Commit b3854bd

Browse files
committed
Log full explanations in evals and no explanations in notebooks; store queries and reference answers in two parallel arrays
1 parent 03c650e commit b3854bd

File tree

7 files changed

+71
-86
lines changed

7 files changed

+71
-86
lines changed

notebooks/flight_search_agent_langraph/data/queries.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Shared flight search queries for both evaluation and testing.
33
"""
44

5-
# Flight search queries for evaluation and testing
5+
# Flight search queries (for evaluation and testing)
66
FLIGHT_SEARCH_QUERIES = [
77
"Find flights from JFK to LAX",
88
"Book a flight from LAX to JFK for tomorrow, 2 passengers, business class",
@@ -11,11 +11,13 @@
1111
"What do passengers say about SpiceJet's service quality?",
1212
]
1313

14-
# Reference answers based on actual system responses
15-
QUERY_REFERENCE_ANSWERS = {
16-
"Find flights from JFK to LAX": """There are several flights available from JFK to LAX: AS flight using 321 762, B6 flight using 320, DL flight using 76W 752, QF flight using 744, AA flight using 32B 762, UA flight using 757, US flight using 32B 762, and VX flight using 320.""",
14+
# Comprehensive reference answers based on actual system responses
15+
FLIGHT_REFERENCE_ANSWERS = [
16+
# Query 1: Flight search JFK to LAX
17+
"""There are several flights available from JFK to LAX: AS flight using 321 762, B6 flight using 320, DL flight using 76W 752, QF flight using 744, AA flight using 32B 762, UA flight using 757, US flight using 32B 762, and VX flight using 320.""",
1718

18-
"Book a flight from LAX to JFK for tomorrow, 2 passengers, business class": """Flight Booking Confirmed!
19+
# Query 2: Flight booking LAX to JFK
20+
"""Flight Booking Confirmed!
1921
2022
Booking ID: FL0718575D3BEE
2123
Route: LAX → JFK
@@ -31,7 +33,8 @@
3133
3234
Thank you for choosing our airline!""",
3335

34-
"Book an economy flight from JFK to MIA for next week, 1 passenger": """Flight Booking Confirmed!
36+
# Query 3: Flight booking JFK to MIA
37+
"""Flight Booking Confirmed!
3538
3639
Booking ID: FL07248898793D
3740
Route: JFK → MIA
@@ -42,12 +45,13 @@
4245
4346
Next Steps:
4447
1. Check-in opens 24 hours before departure
45-
2. Arrive at airport 2 hours early for domestic flights
48+
2. Arrive at airport 2 hours early for domestic flights
4649
3. Bring valid government-issued photo ID
4750
4851
Thank you for choosing our airline!""",
4952

50-
"Show me my current flight bookings": """Your current flight bookings are as follows:
53+
# Query 4: Show current bookings
54+
"""Your current flight bookings are as follows:
5155
5256
1. Booking ID: FL07248898793D
5357
- Route: JFK → MIA
@@ -67,7 +71,8 @@
6771
- Status: confirmed
6872
- Booked: 2025-07-17""",
6973

70-
"What do passengers say about SpiceJet's service quality?": """Found 5 relevant airline reviews for SpiceJet service quality:
74+
# Query 5: SpiceJet reviews
75+
"""Found 5 relevant airline reviews for SpiceJet service quality:
7176
7277
Review 1:
7378
Airline: SpiceJet. Title: "a pleasant journey". Review: ✅ Trip Verified |It was a pleasant journey on this SpiceJet flight. Air-hostess are so kind and helpful. Supported well for senior citizens with great hospitality. Thanks to SpiceJet team.. Rating: 10.0/10. Reviewer: Thyagaraju Palisetty. Date: 18th April 2024. Recommended: yes
@@ -83,6 +88,11 @@
8388
8489
Review 5:
8590
Airline: SpiceJet. Title: SpiceJet customer review. Review: Flight to Kolkata with Spicejet and return back to Delhi was the best. Comfortable and fast option. For my next flight to Kolkata I will for sure choose this Airline.. Rating: 6.0/10. Reviewer: R Martin. Date: 21st April 2019. Recommended: yes""",
91+
]
92+
93+
# Create dictionary for backward compatibility
94+
QUERY_REFERENCE_ANSWERS = {
95+
query: answer for query, answer in zip(FLIGHT_SEARCH_QUERIES, FLIGHT_REFERENCE_ANSWERS)
8696
}
8797

8898
def get_test_queries():

notebooks/flight_search_agent_langraph/evals/eval_arize.py

Lines changed: 13 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -670,48 +670,25 @@ def _log_evaluation_summary(self, results_df: pd.DataFrame) -> None:
670670

671671
logger.info(f" Query {i+1}: {' | '.join(scores)}")
672672

673-
# Sample results
673+
# Sample results with FULL detailed explanations for debugging
674674
if len(results_df) > 0:
675-
logger.info("\n📝 Detailed evaluation results:")
676-
# Show all results, not just 2
677-
for i in range(len(results_df)):
675+
logger.info("\n📝 DETAILED EVALUATION RESULTS (FULL EXPLANATIONS):")
676+
logger.info("="*80)
677+
for i in range(min(len(results_df), len(results_df))): # Show all results
678678
row = results_df.iloc[i]
679-
logger.info(f"\n 📋 Query {i+1}: {row['query']}")
679+
logger.info(f"\n🔍 QUERY {i+1}: {row['query']}")
680+
logger.info("-"*60)
680681

681682
for eval_type in ["relevance", "qa_correctness", "hallucination", "toxicity"]:
682683
if eval_type in row:
683684
result = row[eval_type]
684-
explanation = str(row.get(f"{eval_type}_explanation", ""))
685-
686-
# Clean up explanations - remove reference text mentions and make more concise
687-
if explanation and explanation != "":
688-
# Clean up the explanation text
689-
explanation = explanation.replace("The reference text", "The expected answer")
690-
explanation = explanation.replace("reference text", "expected answer")
691-
explanation = explanation.replace("The question asks", "This query asks")
692-
explanation = explanation.replace("To determine if the answer", "The answer")
693-
explanation = explanation.replace("To determine whether the text", "This text")
694-
explanation = explanation.replace("Therefore, the reference text contains relevant information", "This is relevant")
695-
explanation = explanation.replace("Therefore, the answer", "The response")
696-
697-
# Extract key reasoning points and make more concise
698-
if len(explanation) > 300:
699-
# Find the core reasoning
700-
sentences = explanation.split('. ')
701-
core_sentences = []
702-
for sentence in sentences:
703-
if any(keyword in sentence.lower() for keyword in ['correct', 'factual', 'relevant', 'toxic', 'because', 'therefore', 'accurate', 'match']):
704-
core_sentences.append(sentence)
705-
if core_sentences:
706-
explanation = '. '.join(core_sentences[:3]) + '.'
707-
else:
708-
explanation = '. '.join(sentences[:2]) + '.'
709-
710-
logger.info(f" ✅ {eval_type.title().replace('_', ' ')}: {result}")
711-
logger.info(f" 💭 {explanation}")
712-
else:
713-
logger.info(f" ✅ {eval_type.title().replace('_', ' ')}: {result}")
714-
logger.info(" " + "="*50)
685+
# Show FULL explanation instead of processed/truncated version
686+
full_explanation = str(row.get(f"{eval_type}_explanation", "No explanation provided"))
687+
logger.info(f"\n📊 {eval_type.upper()}: {result}")
688+
logger.info(f"💭 FULL REASONING:")
689+
logger.info(f"{full_explanation}")
690+
logger.info("-"*40)
691+
logger.info("="*80)
715692

716693
def cleanup(self) -> None:
717694
"""Clean up all resources."""

notebooks/flight_search_agent_langraph/flight_search_agent_tutorial.ipynb

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1261,8 +1261,6 @@
12611261
" query = flight_eval_data[i][\"input\"]\n",
12621262
" logger.info(f\" Query: {query}\")\n",
12631263
" logger.info(f\" Relevance: {result.label}\")\n",
1264-
" if hasattr(result, 'explanation') and result.explanation:\n",
1265-
" logger.info(f\" Explanation: {result.explanation}\")\n",
12661264
" logger.info(\" \" + \"-\"*30)\n",
12671265
" \n",
12681266
" # 2. QA Evaluation\n",
@@ -1279,8 +1277,6 @@
12791277
" query = flight_eval_data[i][\"input\"]\n",
12801278
" logger.info(f\" Query: {query}\")\n",
12811279
" logger.info(f\" QA Score: {result.label}\")\n",
1282-
" if hasattr(result, 'explanation') and result.explanation:\n",
1283-
" logger.info(f\" Explanation: {result.explanation}\")\n",
12841280
" logger.info(\" \" + \"-\"*30)\n",
12851281
" \n",
12861282
" # 3. Hallucination Evaluation\n",
@@ -1297,8 +1293,6 @@
12971293
" query = flight_eval_data[i][\"input\"]\n",
12981294
" logger.info(f\" Query: {query}\")\n",
12991295
" logger.info(f\" Hallucination: {result.label}\")\n",
1300-
" if hasattr(result, 'explanation') and result.explanation:\n",
1301-
" logger.info(f\" Explanation: {result.explanation}\")\n",
13021296
" logger.info(\" \" + \"-\"*30)\n",
13031297
" \n",
13041298
" # 4. Toxicity Evaluation\n",
@@ -1315,8 +1309,6 @@
13151309
" query = flight_eval_data[i][\"input\"]\n",
13161310
" logger.info(f\" Query: {query}\")\n",
13171311
" logger.info(f\" Toxicity: {result.label}\")\n",
1318-
" if hasattr(result, 'explanation') and result.explanation:\n",
1319-
" logger.info(f\" Explanation: {result.explanation}\")\n",
13201312
" logger.info(\" \" + \"-\"*30)\n",
13211313
" \n",
13221314
" # Summary of all evaluations\n",

notebooks/hotel_support_agent/hotel_support_agent_tutorial.ipynb

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,11 +1199,10 @@
11991199
" relevance_labels = hotel_relevance_results[\"label\"].tolist() if \"label\" in hotel_relevance_results.columns else [\"unknown\"] * len(hotel_eval_data)\n",
12001200
" relevance_explanations = hotel_relevance_results[\"explanation\"].tolist() if \"explanation\" in hotel_relevance_results.columns else [\"No explanation\"] * len(hotel_eval_data)\n",
12011201
" \n",
1202-
" for i, (label, explanation) in enumerate(zip(relevance_labels, relevance_explanations)):\n",
1202+
" for i, label in enumerate(relevance_labels):\n",
12031203
" query = hotel_eval_data[i][\"input\"]\n",
12041204
" logger.info(f\" Query: {query}\")\n",
12051205
" logger.info(f\" Relevance: {label}\")\n",
1206-
" logger.info(f\" Explanation: {explanation}\")\n",
12071206
" logger.info(\" \" + \"-\"*30)\n",
12081207
" \n",
12091208
" # 2. QA Evaluation\n",
@@ -1221,11 +1220,10 @@
12211220
" qa_labels = hotel_qa_results[\"label\"].tolist() if \"label\" in hotel_qa_results.columns else [\"unknown\"] * len(hotel_eval_data)\n",
12221221
" qa_explanations = hotel_qa_results[\"explanation\"].tolist() if \"explanation\" in hotel_qa_results.columns else [\"No explanation\"] * len(hotel_eval_data)\n",
12231222
" \n",
1224-
" for i, (label, explanation) in enumerate(zip(qa_labels, qa_explanations)):\n",
1223+
" for i, label in enumerate(qa_labels):\n",
12251224
" query = hotel_eval_data[i][\"input\"]\n",
12261225
" logger.info(f\" Query: {query}\")\n",
12271226
" logger.info(f\" QA Score: {label}\")\n",
1228-
" logger.info(f\" Explanation: {explanation}\")\n",
12291227
" logger.info(\" \" + \"-\"*30)\n",
12301228
" \n",
12311229
" # 3. Hallucination Evaluation\n",
@@ -1243,11 +1241,10 @@
12431241
" hallucination_labels = hotel_hallucination_results[\"label\"].tolist() if \"label\" in hotel_hallucination_results.columns else [\"unknown\"] * len(hotel_eval_data)\n",
12441242
" hallucination_explanations = hotel_hallucination_results[\"explanation\"].tolist() if \"explanation\" in hotel_hallucination_results.columns else [\"No explanation\"] * len(hotel_eval_data)\n",
12451243
" \n",
1246-
" for i, (label, explanation) in enumerate(zip(hallucination_labels, hallucination_explanations)):\n",
1244+
" for i, label in enumerate(hallucination_labels):\n",
12471245
" query = hotel_eval_data[i][\"input\"]\n",
12481246
" logger.info(f\" Query: {query}\")\n",
12491247
" logger.info(f\" Hallucination: {label}\")\n",
1250-
" logger.info(f\" Explanation: {explanation}\")\n",
12511248
" logger.info(\" \" + \"-\"*30)\n",
12521249
" \n",
12531250
" # 4. Toxicity Evaluation\n",
@@ -1265,11 +1262,10 @@
12651262
" toxicity_labels = hotel_toxicity_results[\"label\"].tolist() if \"label\" in hotel_toxicity_results.columns else [\"unknown\"] * len(hotel_eval_data)\n",
12661263
" toxicity_explanations = hotel_toxicity_results[\"explanation\"].tolist() if \"explanation\" in hotel_toxicity_results.columns else [\"No explanation\"] * len(hotel_eval_data)\n",
12671264
" \n",
1268-
" for i, (label, explanation) in enumerate(zip(toxicity_labels, toxicity_explanations)):\n",
1265+
" for i, label in enumerate(toxicity_labels):\n",
12691266
" query = hotel_eval_data[i][\"input\"]\n",
12701267
" logger.info(f\" Query: {query}\")\n",
12711268
" logger.info(f\" Toxicity: {label}\")\n",
1272-
" logger.info(f\" Explanation: {explanation}\")\n",
12731269
" logger.info(\" \" + \"-\"*30)\n",
12741270
" \n",
12751271
" # Summary of all evaluations\n",

0 commit comments

Comments
 (0)