couchbase-examples
diff --git a/notebooks/flight_search_agent_langraph/data/queries.py b/notebooks/flight_search_agent_langraph/data/queries.py
Lines changed: 19 additions & 9 deletions
diff --git a/notebooks/flight_search_agent_langraph/evals/eval_arize.py b/notebooks/flight_search_agent_langraph/evals/eval_arize.py
Lines changed: 13 additions & 36 deletions
diff --git a/notebooks/flight_search_agent_langraph/flight_search_agent_tutorial.ipynb b/notebooks/flight_search_agent_langraph/flight_search_agent_tutorial.ipynb
Lines changed: 0 additions & 8 deletions
diff --git a/notebooks/hotel_support_agent/hotel_support_agent_tutorial.ipynb b/notebooks/hotel_support_agent/hotel_support_agent_tutorial.ipynb
Lines changed: 4 additions & 8 deletions
@@ -2,7 +2,7 @@
 Shared flight search queries for both evaluation and testing.
 """
 
-# Flight search queries for evaluation and testing
+# Flight search queries (for evaluation and testing)
 FLIGHT_SEARCH_QUERIES = [
     "Find flights from JFK to LAX",
     "Book a flight from LAX to JFK for tomorrow, 2 passengers, business class",
@@ -11,11 +11,13 @@
     "What do passengers say about SpiceJet's service quality?",
 ]
 
-# Reference answers based on actual system responses
-QUERY_REFERENCE_ANSWERS = {
-    "Find flights from JFK to LAX": """There are several flights available from JFK to LAX: AS flight using 321 762, B6 flight using 320, DL flight using 76W 752, QF flight using 744, AA flight using 32B 762, UA flight using 757, US flight using 32B 762, and VX flight using 320.""",
+# Comprehensive reference answers based on actual system responses
+FLIGHT_REFERENCE_ANSWERS = [
+    # Query 1: Flight search JFK to LAX
+    """There are several flights available from JFK to LAX: AS flight using 321 762, B6 flight using 320, DL flight using 76W 752, QF flight using 744, AA flight using 32B 762, UA flight using 757, US flight using 32B 762, and VX flight using 320.""",
 
-    "Book a flight from LAX to JFK for tomorrow, 2 passengers, business class": """Flight Booking Confirmed!
+    # Query 2: Flight booking LAX to JFK
+    """Flight Booking Confirmed!
 
 Booking ID: FL0718575D3BEE
 Route: LAX → JFK
@@ -31,7 +33,8 @@
 
 Thank you for choosing our airline!""",
 
-    "Book an economy flight from JFK to MIA for next week, 1 passenger": """Flight Booking Confirmed!
+    # Query 3: Flight booking JFK to MIA
+    """Flight Booking Confirmed!
 
 Booking ID: FL07248898793D
 Route: JFK → MIA
@@ -42,12 +45,13 @@
 
 Next Steps:
 1. Check-in opens 24 hours before departure
-2. Arrive at airport 2 hours early for domestic flights
+2. Arrive at airport 2 hours early for domestic flights  
 3. Bring valid government-issued photo ID
 
 Thank you for choosing our airline!""",
 
-    "Show me my current flight bookings": """Your current flight bookings are as follows:
+    # Query 4: Show current bookings
+    """Your current flight bookings are as follows:
 
 1. Booking ID: FL07248898793D
    - Route: JFK → MIA
@@ -67,7 +71,8 @@
    - Status: confirmed
    - Booked: 2025-07-17""",
 
-    "What do passengers say about SpiceJet's service quality?": """Found 5 relevant airline reviews for SpiceJet service quality:
+    # Query 5: SpiceJet reviews
+    """Found 5 relevant airline reviews for SpiceJet service quality:
 
 Review 1:
 Airline: SpiceJet. Title: "a pleasant journey". Review: ✅ Trip Verified |It was a pleasant journey on this SpiceJet flight. Air-hostess are so kind and helpful. Supported well for senior citizens with great hospitality. Thanks to SpiceJet team.. Rating: 10.0/10. Reviewer: Thyagaraju Palisetty. Date: 18th April 2024. Recommended: yes
@@ -83,6 +88,11 @@
 
 Review 5:
 Airline: SpiceJet. Title: SpiceJet customer review. Review: Flight to Kolkata with Spicejet and return back to Delhi was the best. Comfortable and fast option. For my next flight to Kolkata I will for sure choose this Airline.. Rating: 6.0/10. Reviewer: R Martin. Date: 21st April 2019. Recommended: yes""",
+]
+
+# Create dictionary for backward compatibility
+QUERY_REFERENCE_ANSWERS = {
+    query: answer for query, answer in zip(FLIGHT_SEARCH_QUERIES, FLIGHT_REFERENCE_ANSWERS)
 }
 
 def get_test_queries():
 
@@ -670,48 +670,25 @@ def _log_evaluation_summary(self, results_df: pd.DataFrame) -> None:
 
                 logger.info(f"   Query {i+1}: {' | '.join(scores)}")
 
-        # Sample results
+        # Sample results with FULL detailed explanations for debugging
         if len(results_df) > 0:
-            logger.info("\n📝 Detailed evaluation results:")
-            # Show all results, not just 2
-            for i in range(len(results_df)):
+            logger.info("\n📝 DETAILED EVALUATION RESULTS (FULL EXPLANATIONS):")
+            logger.info("="*80)
+            for i in range(min(len(results_df), len(results_df))):  # Show all results
                 row = results_df.iloc[i]
-                logger.info(f"\n   📋 Query {i+1}: {row['query']}")
+                logger.info(f"\n🔍 QUERY {i+1}: {row['query']}")
+                logger.info("-"*60)
 
                 for eval_type in ["relevance", "qa_correctness", "hallucination", "toxicity"]:
                     if eval_type in row:
                         result = row[eval_type]
-                        explanation = str(row.get(f"{eval_type}_explanation", ""))
-                        
-                        # Clean up explanations - remove reference text mentions and make more concise
-                        if explanation and explanation != "":
-                            # Clean up the explanation text
-                            explanation = explanation.replace("The reference text", "The expected answer")
-                            explanation = explanation.replace("reference text", "expected answer")
-                            explanation = explanation.replace("The question asks", "This query asks")
-                            explanation = explanation.replace("To determine if the answer", "The answer")
-                            explanation = explanation.replace("To determine whether the text", "This text")
-                            explanation = explanation.replace("Therefore, the reference text contains relevant information", "This is relevant")
-                            explanation = explanation.replace("Therefore, the answer", "The response")
-                            
-                            # Extract key reasoning points and make more concise
-                            if len(explanation) > 300:
-                                # Find the core reasoning
-                                sentences = explanation.split('. ')
-                                core_sentences = []
-                                for sentence in sentences:
-                                    if any(keyword in sentence.lower() for keyword in ['correct', 'factual', 'relevant', 'toxic', 'because', 'therefore', 'accurate', 'match']):
-                                        core_sentences.append(sentence)
-                                if core_sentences:
-                                    explanation = '. '.join(core_sentences[:3]) + '.'
-                                else:
-                                    explanation = '. '.join(sentences[:2]) + '.'
-                            
-                            logger.info(f"   ✅ {eval_type.title().replace('_', ' ')}: {result}")
-                            logger.info(f"      💭 {explanation}")
-                        else:
-                            logger.info(f"   ✅ {eval_type.title().replace('_', ' ')}: {result}")
-                logger.info("   " + "="*50)
+                        # Show FULL explanation instead of processed/truncated version
+                        full_explanation = str(row.get(f"{eval_type}_explanation", "No explanation provided"))
+                        logger.info(f"\n📊 {eval_type.upper()}: {result}")
+                        logger.info(f"💭 FULL REASONING:")
+                        logger.info(f"{full_explanation}")
+                        logger.info("-"*40)
+                logger.info("="*80)
 
     def cleanup(self) -> None:
         """Clean up all resources."""
 
@@ -1261,8 +1261,6 @@
         "            query = flight_eval_data[i][\"input\"]\n",
         "            logger.info(f\"   Query: {query}\")\n",
         "            logger.info(f\"   Relevance: {result.label}\")\n",
-        "            if hasattr(result, 'explanation') and result.explanation:\n",
-        "                logger.info(f\"   Explanation: {result.explanation}\")\n",
         "            logger.info(\"   \" + \"-\"*30)\n",
         "        \n",
         "        # 2. QA Evaluation\n",
@@ -1279,8 +1277,6 @@
         "            query = flight_eval_data[i][\"input\"]\n",
         "            logger.info(f\"   Query: {query}\")\n",
         "            logger.info(f\"   QA Score: {result.label}\")\n",
-        "            if hasattr(result, 'explanation') and result.explanation:\n",
-        "                logger.info(f\"   Explanation: {result.explanation}\")\n",
         "            logger.info(\"   \" + \"-\"*30)\n",
         "        \n",
         "        # 3. Hallucination Evaluation\n",
@@ -1297,8 +1293,6 @@
         "            query = flight_eval_data[i][\"input\"]\n",
         "            logger.info(f\"   Query: {query}\")\n",
         "            logger.info(f\"   Hallucination: {result.label}\")\n",
-        "            if hasattr(result, 'explanation') and result.explanation:\n",
-        "                logger.info(f\"   Explanation: {result.explanation}\")\n",
         "            logger.info(\"   \" + \"-\"*30)\n",
         "        \n",
         "        # 4. Toxicity Evaluation\n",
@@ -1315,8 +1309,6 @@
         "            query = flight_eval_data[i][\"input\"]\n",
         "            logger.info(f\"   Query: {query}\")\n",
         "            logger.info(f\"   Toxicity: {result.label}\")\n",
-        "            if hasattr(result, 'explanation') and result.explanation:\n",
-        "                logger.info(f\"   Explanation: {result.explanation}\")\n",
         "            logger.info(\"   \" + \"-\"*30)\n",
         "        \n",
         "        # Summary of all evaluations\n",
 
@@ -1199,11 +1199,10 @@
         "            relevance_labels = hotel_relevance_results[\"label\"].tolist() if \"label\" in hotel_relevance_results.columns else [\"unknown\"] * len(hotel_eval_data)\n",
         "            relevance_explanations = hotel_relevance_results[\"explanation\"].tolist() if \"explanation\" in hotel_relevance_results.columns else [\"No explanation\"] * len(hotel_eval_data)\n",
         "            \n",
-        "            for i, (label, explanation) in enumerate(zip(relevance_labels, relevance_explanations)):\n",
+        "            for i, label in enumerate(relevance_labels):\n",
         "                query = hotel_eval_data[i][\"input\"]\n",
         "                logger.info(f\"   Query: {query}\")\n",
         "                logger.info(f\"   Relevance: {label}\")\n",
-        "                logger.info(f\"   Explanation: {explanation}\")\n",
         "                logger.info(\"   \" + \"-\"*30)\n",
         "        \n",
         "        # 2. QA Evaluation\n",
@@ -1221,11 +1220,10 @@
         "            qa_labels = hotel_qa_results[\"label\"].tolist() if \"label\" in hotel_qa_results.columns else [\"unknown\"] * len(hotel_eval_data)\n",
         "            qa_explanations = hotel_qa_results[\"explanation\"].tolist() if \"explanation\" in hotel_qa_results.columns else [\"No explanation\"] * len(hotel_eval_data)\n",
         "            \n",
-        "            for i, (label, explanation) in enumerate(zip(qa_labels, qa_explanations)):\n",
+        "            for i, label in enumerate(qa_labels):\n",
         "                query = hotel_eval_data[i][\"input\"]\n",
         "                logger.info(f\"   Query: {query}\")\n",
         "                logger.info(f\"   QA Score: {label}\")\n",
-        "                logger.info(f\"   Explanation: {explanation}\")\n",
         "                logger.info(\"   \" + \"-\"*30)\n",
         "        \n",
         "        # 3. Hallucination Evaluation\n",
@@ -1243,11 +1241,10 @@
         "            hallucination_labels = hotel_hallucination_results[\"label\"].tolist() if \"label\" in hotel_hallucination_results.columns else [\"unknown\"] * len(hotel_eval_data)\n",
         "            hallucination_explanations = hotel_hallucination_results[\"explanation\"].tolist() if \"explanation\" in hotel_hallucination_results.columns else [\"No explanation\"] * len(hotel_eval_data)\n",
         "            \n",
-        "            for i, (label, explanation) in enumerate(zip(hallucination_labels, hallucination_explanations)):\n",
+        "            for i, label in enumerate(hallucination_labels):\n",
         "                query = hotel_eval_data[i][\"input\"]\n",
         "                logger.info(f\"   Query: {query}\")\n",
         "                logger.info(f\"   Hallucination: {label}\")\n",
-        "                logger.info(f\"   Explanation: {explanation}\")\n",
         "                logger.info(\"   \" + \"-\"*30)\n",
         "        \n",
         "        # 4. Toxicity Evaluation\n",
@@ -1265,11 +1262,10 @@
         "            toxicity_labels = hotel_toxicity_results[\"label\"].tolist() if \"label\" in hotel_toxicity_results.columns else [\"unknown\"] * len(hotel_eval_data)\n",
         "            toxicity_explanations = hotel_toxicity_results[\"explanation\"].tolist() if \"explanation\" in hotel_toxicity_results.columns else [\"No explanation\"] * len(hotel_eval_data)\n",
         "            \n",
-        "            for i, (label, explanation) in enumerate(zip(toxicity_labels, toxicity_explanations)):\n",
+        "            for i, label in enumerate(toxicity_labels):\n",
         "                query = hotel_eval_data[i][\"input\"]\n",
         "                logger.info(f\"   Query: {query}\")\n",
         "                logger.info(f\"   Toxicity: {label}\")\n",
-        "                logger.info(f\"   Explanation: {explanation}\")\n",
         "                logger.info(\"   \" + \"-\"*30)\n",
         "        \n",
         "        # Summary of all evaluations\n",