#!/usr/bin/env python3
"""
Async example demonstrating the ScrapeGraphAI Crawler markdown conversion mode.

This example shows how to use the async crawler in markdown conversion mode:
- Cost-effective markdown conversion (NO AI/LLM processing)
- 2 credits per page (80% savings compared to AI mode)
- Clean HTML to markdown conversion with metadata extraction

Requirements:
- Python 3.7+
- scrapegraph-py
- aiohttp (installed with scrapegraph-py)
- A valid API key

Usage:
    python async_crawl_markdown_example.py
"""

import asyncio
import os
from typing import Any, Dict

from scrapegraph_py import AsyncClient


async def poll_for_result(client: AsyncClient, crawl_id: str, max_attempts: int = 20) -> Dict[str, Any]:
    """
    Poll for crawl results with intelligent backoff to avoid rate limits.

    Args:
        client: The async ScrapeGraph client
        crawl_id: The crawl ID to poll for
        max_attempts: Maximum number of polling attempts

    Returns:
        The final result, or raises an exception on timeout/failure
    """
    print("⏳ Starting to poll for results with rate-limit protection...")

    # Initial wait to give the job time to start processing
    await asyncio.sleep(15)

    for attempt in range(max_attempts):
        try:
            result = await client.get_crawl(crawl_id)
            status = result.get("status")

            if status == "success":
                return result
            elif status == "failed":
                raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}")
            else:
                # Calculate progressive wait time: start at 15s, increase gradually
                base_wait = 15
                progressive_wait = min(60, base_wait + (attempt * 3))  # Cap at 60s

                print(f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s...")
                await asyncio.sleep(progressive_wait)

        except Exception as e:
            if "rate" in str(e).lower() or "429" in str(e):
                wait_time = min(90, 45 + (attempt * 10))
                print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...")
                await asyncio.sleep(wait_time)
                continue
            else:
                print(f"❌ Error polling for results: {e}")
                if attempt < max_attempts - 1:
                    await asyncio.sleep(20)  # Wait before retry
                    continue
                raise

    raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts")


async def markdown_crawling_example():
    """
    Markdown Conversion Mode (NO AI/LLM Used)

    This example demonstrates cost-effective crawling that converts pages to clean markdown
    WITHOUT any AI processing. Perfect for content archival and when you only need clean markdown.
    """
    print("=" * 60)
    print("ASYNC MARKDOWN CONVERSION MODE (NO AI/LLM)")
    print("=" * 60)
    print("Use case: Get clean markdown content without AI processing")
    print("Cost: 2 credits per page (80% savings!)")
    print("Features: Clean markdown conversion, metadata extraction")
    print("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!")
    print()

    # Initialize the async client
    client = AsyncClient.from_env()

    # Target URL for markdown conversion
    url = "https://scrapegraphai.com/"

    print(f"🌐 Target URL: {url}")
    print("🤖 AI Prompt: None (no AI processing)")
    print("📊 Crawl Depth: 2")
    print("📄 Max Pages: 2")
    print("🗺️ Use Sitemap: False")
    print("💡 Mode: Pure HTML to markdown conversion")
    print()

    # Start the markdown conversion job
    print("🚀 Starting markdown conversion job...")

    # Call crawl with extraction_mode=False for markdown conversion
    response = await client.crawl(
        url=url,
        extraction_mode=False,  # False = markdown conversion mode (NO AI/LLM used)
        depth=2,
        max_pages=2,
        same_domain_only=True,
        sitemap=False,  # Sitemap disabled in this example; set True for better coverage
        # Note: No prompt or data_schema needed when extraction_mode=False
    )

    crawl_id = response.get("crawl_id") or response.get("task_id")

    if not crawl_id:
        print("❌ Failed to start markdown conversion job")
        return

    print(f"📋 Crawl ID: {crawl_id}")
    print("⏳ Polling for results...")
    print()

    # Poll for results with rate-limit protection
    try:
        result = await poll_for_result(client, crawl_id, max_attempts=20)

        print("✅ Markdown conversion completed successfully!")
        print()

        result_data = result.get("result", {})
        pages = result_data.get("pages", [])
        crawled_urls = result_data.get("crawled_urls", [])
        credits_used = result_data.get("credits_used", 0)
        pages_processed = result_data.get("pages_processed", 0)

        print("📊 CONVERSION RESULTS:")
        print("-" * 40)
        print(f"📄 Pages processed: {pages_processed}")
        print(f"💰 Credits used: {credits_used}")
        print(f"💵 Cost per page: {credits_used / pages_processed if pages_processed > 0 else 0:.1f} credits")
        if crawled_urls:
            print(f"🔗 URLs processed: {crawled_urls}")
        print()

        print("📝 MARKDOWN CONTENT:")
        print("-" * 40)
        if pages:
            print(f"📄 Total pages with markdown: {len(pages)}")
            for i, page in enumerate(pages[:3]):  # Show first 3 pages
                print(f"\n📄 Page {i + 1}:")
                print(f"   URL: {page.get('url')}")
                print(f"   Title: {page.get('title')}")

                metadata = page.get("metadata", {})
                print(f"   📊 Word count: {metadata.get('word_count', 0)}")
                print(f"   📋 Headers: {metadata.get('headers', [])[:3]}")  # First 3 headers
                print(f"   🔗 Links: {metadata.get('links_count', 0)}")

                # Show a short markdown preview
                markdown_content = page.get("markdown", "")
                markdown_preview = markdown_content[:200]
                if len(markdown_content) > 200:
                    markdown_preview += "..."
                print(f"   📝 Content preview: {markdown_preview}")

            if len(pages) > 3:
                print(f"\n   ... and {len(pages) - 3} more pages with markdown content")
        else:
            print("No markdown content available")

    except Exception as e:
        print(f"❌ Markdown conversion failed: {str(e)}")


async def main():
    """Run the async markdown crawling example."""
    print("🌐 ScrapeGraphAI Async Crawler - Markdown Conversion Example")
    print("Cost-effective HTML to Markdown conversion (NO AI/LLM)")
    print("=" * 60)

    # Check if the API key is set
    api_key = os.getenv("SGAI_API_KEY")
    if not api_key:
        print("⚠️ Please set your API key in the environment variable SGAI_API_KEY")
        print("   export SGAI_API_KEY=your_api_key_here")
        print()
        print("   You can get your API key from: https://dashboard.scrapegraphai.com")
        return

    print(f"🔑 Using API key: {api_key[:10]}...")
    print()

    # Run the markdown conversion example
    await markdown_crawling_example()

    print("\n" + "=" * 60)
    print("🎉 Example completed!")
    print("💡 This demonstrates async markdown conversion mode:")
    print("   • Cost-effective: Only 2 credits per page")
    print("   • No AI/LLM processing - pure HTML to markdown conversion")
    print("   • Perfect for content archival and documentation")
    print("   • 80% cheaper than AI extraction modes!")


if __name__ == "__main__":
    asyncio.run(main())