
Commit e5d573b

feat: add crawling markdown

1 parent 733b669 commit e5d573b

File tree

8 files changed: +714 −39 lines changed

scrapegraph-py/README.md

Lines changed: 91 additions & 0 deletions
@@ -21,6 +21,8 @@ pip install scrapegraph-py
## 🚀 Features

- 🤖 AI-powered web scraping and search
- 🕷️ Smart crawling with both AI extraction and markdown conversion modes
- 💰 Cost-effective markdown conversion (80% savings vs AI mode)
- 🔄 Both sync and async clients
- 📊 Structured output with Pydantic schemas
- 🔍 Detailed logging
@@ -219,6 +221,95 @@ response = client.markdownify(
print(response)
```

### 🕷️ Crawler

Intelligently crawl and extract data from multiple pages, with support for both AI extraction and markdown conversion modes.

#### AI Extraction Mode (Default)

Extract structured data from multiple pages using AI:

```python
from scrapegraph_py import Client

client = Client(api_key="your-api-key-here")

# Define the data schema for extraction
schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string"},
        "founders": {
            "type": "array",
            "items": {"type": "string"}
        },
        "description": {"type": "string"}
    }
}

response = client.crawl(
    url="https://scrapegraphai.com",
    prompt="extract the company information and founders",
    data_schema=schema,
    depth=2,
    max_pages=5,
    same_domain_only=True
)

# Poll for results (crawl runs asynchronously)
crawl_id = response.get("crawl_id")
result = client.get_crawl(crawl_id)
```
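
Because `crawl` submits an asynchronous job, a single `get_crawl` call may return before the job finishes. A minimal polling sketch, assuming the result dict carries a `status` field that settles to `"success"` or `"failed"` (as in the async example added by this commit):

```python
import time

# Minimal polling sketch (assumption: the job result exposes a "status"
# field that ends up as "success" or "failed", as in the bundled example).
result = client.get_crawl(crawl_id)
while result.get("status") not in ("success", "failed"):
    time.sleep(10)  # back off between polls to stay under rate limits
    result = client.get_crawl(crawl_id)

if result.get("status") == "success":
    print(result["result"])
```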
#### Markdown Conversion Mode (Cost-Effective)

Convert pages to clean markdown without AI processing (80% cheaper):

```python
from scrapegraph_py import Client

client = Client(api_key="your-api-key-here")

response = client.crawl(
    url="https://scrapegraphai.com",
    extraction_mode=False,  # Markdown conversion mode
    depth=2,
    max_pages=5,
    same_domain_only=True,
    sitemap=True  # Use the sitemap for better page discovery
)

# Poll for results
crawl_id = response.get("crawl_id")
result = client.get_crawl(crawl_id)

# Access the markdown content
for page in result["result"]["pages"]:
    print(f"URL: {page['url']}")
    print(f"Markdown: {page['markdown']}")
    print(f"Metadata: {page['metadata']}")
```
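
Since markdown mode is aimed at content archival, a short sketch that writes each crawled page to disk may be useful; the slug-based file naming here is illustrative, not part of the API:

```python
from pathlib import Path

# Illustrative sketch: archive each page's markdown to a local file.
# The slug-derived file names are an assumption, not an API feature.
out_dir = Path("crawl_output")
out_dir.mkdir(exist_ok=True)

for page in result["result"]["pages"]:
    slug = page["url"].rstrip("/").split("/")[-1] or "index"
    (out_dir / f"{slug}.md").write_text(page["markdown"], encoding="utf-8")
```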
290+
291+
<details>
292+
<summary>🔧 Crawl Parameters</summary>
293+
294+
- **url** (required): Starting URL for the crawl
295+
- **extraction_mode** (default: True):
296+
- `True` = AI extraction mode (requires prompt and data_schema)
297+
- `False` = Markdown conversion mode (no AI, 80% cheaper)
298+
- **prompt** (required for AI mode): AI prompt to guide data extraction
299+
- **data_schema** (required for AI mode): JSON schema defining extracted data structure
300+
- **depth** (default: 2): Maximum crawl depth (1-10)
301+
- **max_pages** (default: 2): Maximum pages to crawl (1-100)
302+
- **same_domain_only** (default: True): Only crawl pages from the same domain
303+
- **sitemap** (default: False): Use sitemap for better page discovery
304+
- **cache_website** (default: True): Cache website content
305+
- **batch_size** (optional): Batch size for processing pages (1-10)
306+
307+
**Cost Comparison:**
308+
- AI Extraction Mode: ~10 credits per page
309+
- Markdown Conversion Mode: ~2 credits per page (80% savings!)
310+
311+
</details>
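
For reference, a hedged sketch that exercises the optional parameters above in markdown mode; the values are illustrative, and `client` is the `Client` from the earlier examples:

```python
# Illustrative values only; see the parameter list above for valid ranges.
response = client.crawl(
    url="https://scrapegraphai.com",
    extraction_mode=False,   # markdown mode, ~2 credits per page
    depth=3,                 # follow links up to 3 hops (1-10)
    max_pages=20,            # cap on pages fetched (1-100)
    same_domain_only=True,   # stay on the starting domain
    sitemap=True,            # seed page discovery from the sitemap
    cache_website=True,      # cache fetched content
    batch_size=5,            # process pages in batches (1-10)
)
```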
## ⚡ Async Support

All endpoints support async operations:
Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
```python
#!/usr/bin/env python3
"""
Async example demonstrating the ScrapeGraphAI Crawler markdown conversion mode.

This example shows how to use the async crawler in markdown conversion mode:
- Cost-effective markdown conversion (NO AI/LLM processing)
- 2 credits per page (80% savings compared to AI mode)
- Clean HTML to markdown conversion with metadata extraction

Requirements:
- Python 3.7+
- scrapegraph-py
- aiohttp (installed with scrapegraph-py)
- A valid API key

Usage:
    python async_crawl_markdown_example.py
"""

import asyncio
import os
from typing import Any, Dict

from scrapegraph_py import AsyncClient


async def poll_for_result(client: AsyncClient, crawl_id: str, max_attempts: int = 20) -> Dict[str, Any]:
    """
    Poll for crawl results with intelligent backoff to avoid rate limits.

    Args:
        client: The async ScrapeGraph client
        crawl_id: The crawl ID to poll for
        max_attempts: Maximum number of polling attempts

    Returns:
        The final result; raises an exception on timeout or failure.
    """
    print("⏳ Starting to poll for results with rate-limit protection...")

    # Initial wait to give the job time to start processing
    await asyncio.sleep(15)

    for attempt in range(max_attempts):
        try:
            result = await client.get_crawl(crawl_id)
            status = result.get("status")

            if status == "success":
                return result
            elif status == "failed":
                raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}")
            else:
                # Calculate progressive wait time: start at 15s, increase gradually
                base_wait = 15
                progressive_wait = min(60, base_wait + (attempt * 3))  # Cap at 60s

                print(f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s...")
                await asyncio.sleep(progressive_wait)

        except Exception as e:
            if "rate" in str(e).lower() or "429" in str(e):
                wait_time = min(90, 45 + (attempt * 10))
                print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...")
                await asyncio.sleep(wait_time)
                continue
            else:
                print(f"❌ Error polling for results: {e}")
                if attempt < max_attempts - 1:
                    await asyncio.sleep(20)  # Wait before retrying
                    continue
                raise

    raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts")


async def markdown_crawling_example():
    """
    Markdown Conversion Mode (NO AI/LLM used)

    This example demonstrates cost-effective crawling that converts pages to
    clean markdown WITHOUT any AI processing. Perfect for content archival and
    for when you only need clean markdown.
    """
    print("=" * 60)
    print("ASYNC MARKDOWN CONVERSION MODE (NO AI/LLM)")
    print("=" * 60)
    print("Use case: Get clean markdown content without AI processing")
    print("Cost: 2 credits per page (80% savings!)")
    print("Features: Clean markdown conversion, metadata extraction")
    print("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!")
    print()

    # Initialize the async client from the SGAI_API_KEY environment variable
    client = AsyncClient.from_env()

    # Target URL for markdown conversion
    url = "https://scrapegraphai.com/"

    print(f"🌐 Target URL: {url}")
    print("🤖 AI Prompt: None (no AI processing)")
    print("📊 Crawl Depth: 2")
    print("📄 Max Pages: 2")
    print("🗺️ Use Sitemap: False")
    print("💡 Mode: Pure HTML to markdown conversion")
    print()

    # Start the markdown conversion job
    print("🚀 Starting markdown conversion job...")

    # Call crawl with extraction_mode=False for markdown conversion
    response = await client.crawl(
        url=url,
        extraction_mode=False,  # False = markdown conversion mode (NO AI/LLM used)
        depth=2,
        max_pages=2,
        same_domain_only=True,
        sitemap=False,  # Sitemap discovery disabled for this run
        # Note: no prompt or data_schema is needed when extraction_mode=False
    )

    crawl_id = response.get("crawl_id") or response.get("task_id")

    if not crawl_id:
        print("❌ Failed to start markdown conversion job")
        return

    print(f"📋 Crawl ID: {crawl_id}")
    print("⏳ Polling for results...")
    print()

    # Poll for results with rate-limit protection
    try:
        result = await poll_for_result(client, crawl_id, max_attempts=20)

        print("✅ Markdown conversion completed successfully!")
        print()

        result_data = result.get("result", {})
        pages = result_data.get("pages", [])
        crawled_urls = result_data.get("crawled_urls", [])
        credits_used = result_data.get("credits_used", 0)
        pages_processed = result_data.get("pages_processed", 0)

        print("📊 CONVERSION RESULTS:")
        print("-" * 40)
        print(f"📄 Pages processed: {pages_processed}")
        print(f"💰 Credits used: {credits_used}")
        print(f"💵 Cost per page: {credits_used / pages_processed if pages_processed > 0 else 0:.1f} credits")
        if crawled_urls:
            print(f"🔗 URLs processed: {crawled_urls}")
        print()

        print("📝 MARKDOWN CONTENT:")
        print("-" * 40)
        if pages:
            print(f"📄 Total pages with markdown: {len(pages)}")
            for i, page in enumerate(pages[:3]):  # Show the first 3 pages
                print(f"\n📄 Page {i + 1}:")
                print(f"   URL: {page.get('url')}")
                print(f"   Title: {page.get('title')}")

                metadata = page.get("metadata", {})
                print(f"   📊 Word count: {metadata.get('word_count', 0)}")
                print(f"   📋 Headers: {metadata.get('headers', [])[:3]}")  # First 3 headers
                print(f"   🔗 Links: {metadata.get('links_count', 0)}")

                # Show a preview of the markdown content
                markdown_content = page.get("markdown", "")
                markdown_preview = markdown_content[:200]
                if len(markdown_content) > 200:
                    markdown_preview += "..."
                print(f"   📝 Content preview: {markdown_preview}")

            if len(pages) > 3:
                print(f"\n   ... and {len(pages) - 3} more pages with markdown content")
        else:
            print("No markdown content available")

    except Exception as e:
        print(f"❌ Markdown conversion failed: {e}")


async def main():
    """Run the async markdown crawling example."""
    print("🌐 ScrapeGraphAI Async Crawler - Markdown Conversion Example")
    print("Cost-effective HTML to Markdown conversion (NO AI/LLM)")
    print("=" * 60)

    # Check whether the API key is set
    api_key = os.getenv("SGAI_API_KEY")
    if not api_key:
        print("⚠️ Please set your API key in the environment variable SGAI_API_KEY")
        print("   export SGAI_API_KEY=your_api_key_here")
        print()
        print("   You can get your API key from: https://dashboard.scrapegraphai.com")
        return

    print(f"🔑 Using API key: {api_key[:10]}...")
    print()

    # Run the markdown conversion example
    await markdown_crawling_example()

    print("\n" + "=" * 60)
    print("🎉 Example completed!")
    print("💡 This demonstrates async markdown conversion mode:")
    print("   • Cost-effective: only 2 credits per page")
    print("   • No AI/LLM processing - pure HTML to markdown conversion")
    print("   • Perfect for content archival and documentation")
    print("   • 80% cheaper than AI extraction mode!")


if __name__ == "__main__":
    asyncio.run(main())
```
