
Commit e5d573b

feat: add crawling markdown

1 parent 733b669 commit e5d573b

File tree

8 files changed: +714 −39 lines changed

scrapegraph-py/README.md

Lines changed: 91 additions & 0 deletions
@@ -21,6 +21,8 @@ pip install scrapegraph-py
## 🚀 Features

- 🤖 AI-powered web scraping and search
- 🕷️ Smart crawling with both AI extraction and markdown conversion modes
- 💰 Cost-effective markdown conversion (80% savings vs AI mode)
- 🔄 Both sync and async clients
- 📊 Structured output with Pydantic schemas
- 🔍 Detailed logging
@@ -219,6 +221,95 @@ response = client.markdownify(
print(response)
```

### 🕷️ Crawler

Intelligently crawl and extract data from multiple pages, with support for both AI extraction and markdown conversion modes.

#### AI Extraction Mode (Default)

Extract structured data from multiple pages using AI:

```python
from scrapegraph_py import Client

client = Client(api_key="your-api-key-here")

# Define the data schema for extraction
schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string"},
        "founders": {
            "type": "array",
            "items": {"type": "string"}
        },
        "description": {"type": "string"}
    }
}

response = client.crawl(
    url="https://scrapegraphai.com",
    prompt="extract the company information and founders",
    data_schema=schema,
    depth=2,
    max_pages=5,
    same_domain_only=True
)

# Poll for results (crawl runs asynchronously)
crawl_id = response.get("crawl_id")
result = client.get_crawl(crawl_id)
```
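
Because `crawl` submits an asynchronous job, a single `get_crawl` call may return before the job finishes. A minimal polling sketch, assuming the result dict carries a `status` field that settles to `"success"` or `"failed"` (as in the async example added by this commit):

```python
import time

# Minimal polling sketch (assumption: the job result exposes a "status"
# field that ends up as "success" or "failed", as in the bundled example).
result = client.get_crawl(crawl_id)
while result.get("status") not in ("success", "failed"):
    time.sleep(10)  # back off between polls to stay under rate limits
    result = client.get_crawl(crawl_id)

if result.get("status") == "success":
    print(result["result"])
```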
#### Markdown Conversion Mode (Cost-Effective)

Convert pages to clean markdown without AI processing (80% cheaper):

```python
from scrapegraph_py import Client

client = Client(api_key="your-api-key-here")

response = client.crawl(
    url="https://scrapegraphai.com",
    extraction_mode=False,  # Markdown conversion mode
    depth=2,
    max_pages=5,
    same_domain_only=True,
    sitemap=True  # Use the sitemap for better page discovery
)

# Poll for results
crawl_id = response.get("crawl_id")
result = client.get_crawl(crawl_id)

# Access the markdown content
for page in result["result"]["pages"]:
    print(f"URL: {page['url']}")
    print(f"Markdown: {page['markdown']}")
    print(f"Metadata: {page['metadata']}")
```
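
Since markdown mode is aimed at content archival, a short sketch that writes each crawled page to disk may be useful; the slug-based file naming here is illustrative, not part of the API:

```python
from pathlib import Path

# Illustrative sketch: archive each page's markdown to a local file.
# The slug-derived file names are an assumption, not an API feature.
out_dir = Path("crawl_output")
out_dir.mkdir(exist_ok=True)

for page in result["result"]["pages"]:
    slug = page["url"].rstrip("/").split("/")[-1] or "index"
    (out_dir / f"{slug}.md").write_text(page["markdown"], encoding="utf-8")
```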
290+
291+
<details>
292+
<summary>🔧 Crawl Parameters</summary>
293+
294+
- **url** (required): Starting URL for the crawl
295+
- **extraction_mode** (default: True):
296+
- `True` = AI extraction mode (requires prompt and data_schema)
297+
- `False` = Markdown conversion mode (no AI, 80% cheaper)
298+
- **prompt** (required for AI mode): AI prompt to guide data extraction
299+
- **data_schema** (required for AI mode): JSON schema defining extracted data structure
300+
- **depth** (default: 2): Maximum crawl depth (1-10)
301+
- **max_pages** (default: 2): Maximum pages to crawl (1-100)
302+
- **same_domain_only** (default: True): Only crawl pages from the same domain
303+
- **sitemap** (default: False): Use sitemap for better page discovery
304+
- **cache_website** (default: True): Cache website content
305+
- **batch_size** (optional): Batch size for processing pages (1-10)
306+
307+
**Cost Comparison:**
308+
- AI Extraction Mode: ~10 credits per page
309+
- Markdown Conversion Mode: ~2 credits per page (80% savings!)
310+
311+
</details>
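
For reference, a hedged sketch that exercises the optional parameters above in markdown mode; the values are illustrative, and `client` is the `Client` from the earlier examples:

```python
# Illustrative values only; see the parameter list above for valid ranges.
response = client.crawl(
    url="https://scrapegraphai.com",
    extraction_mode=False,   # markdown mode, ~2 credits per page
    depth=3,                 # follow links up to 3 hops (1-10)
    max_pages=20,            # cap on pages fetched (1-100)
    same_domain_only=True,   # stay on the starting domain
    sitemap=True,            # seed page discovery from the sitemap
    cache_website=True,      # cache fetched content
    batch_size=5,            # process pages in batches (1-10)
)
```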
## ⚡ Async Support

All endpoints support async operations:
Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
```python
#!/usr/bin/env python3
"""
Async example demonstrating the ScrapeGraphAI Crawler markdown conversion mode.

This example shows how to use the async crawler in markdown conversion mode:
- Cost-effective markdown conversion (NO AI/LLM processing)
- 2 credits per page (80% savings compared to AI mode)
- Clean HTML to markdown conversion with metadata extraction

Requirements:
- Python 3.7+
- scrapegraph-py
- aiohttp (installed with scrapegraph-py)
- A valid API key

Usage:
    python async_crawl_markdown_example.py
"""

import asyncio
import os
from typing import Any, Dict

from scrapegraph_py import AsyncClient


async def poll_for_result(client: AsyncClient, crawl_id: str, max_attempts: int = 20) -> Dict[str, Any]:
    """
    Poll for crawl results with intelligent backoff to avoid rate limits.

    Args:
        client: The async ScrapeGraph client
        crawl_id: The crawl ID to poll for
        max_attempts: Maximum number of polling attempts

    Returns:
        The final result; raises an exception on timeout or failure.
    """
    print("⏳ Starting to poll for results with rate-limit protection...")

    # Initial wait to give the job time to start processing
    await asyncio.sleep(15)

    for attempt in range(max_attempts):
        try:
            result = await client.get_crawl(crawl_id)
            status = result.get("status")

            if status == "success":
                return result
            elif status == "failed":
                raise Exception(f"Crawl failed: {result.get('error', 'Unknown error')}")
            else:
                # Calculate progressive wait time: start at 15s, increase gradually
                base_wait = 15
                progressive_wait = min(60, base_wait + (attempt * 3))  # Cap at 60s

                print(f"⏳ Status: {status} (attempt {attempt + 1}/{max_attempts}) - waiting {progressive_wait}s...")
                await asyncio.sleep(progressive_wait)

        except Exception as e:
            if "rate" in str(e).lower() or "429" in str(e):
                wait_time = min(90, 45 + (attempt * 10))
                print(f"⚠️ Rate limit detected in error, waiting {wait_time}s...")
                await asyncio.sleep(wait_time)
                continue
            else:
                print(f"❌ Error polling for results: {e}")
                if attempt < max_attempts - 1:
                    await asyncio.sleep(20)  # Wait before retrying
                    continue
                raise

    raise Exception(f"⏰ Timeout: Job did not complete after {max_attempts} attempts")


async def markdown_crawling_example():
    """
    Markdown Conversion Mode (NO AI/LLM used)

    This example demonstrates cost-effective crawling that converts pages to
    clean markdown WITHOUT any AI processing. Perfect for content archival and
    for when you only need clean markdown.
    """
    print("=" * 60)
    print("ASYNC MARKDOWN CONVERSION MODE (NO AI/LLM)")
    print("=" * 60)
    print("Use case: Get clean markdown content without AI processing")
    print("Cost: 2 credits per page (80% savings!)")
    print("Features: Clean markdown conversion, metadata extraction")
    print("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!")
    print()

    # Initialize the async client from the SGAI_API_KEY environment variable
    client = AsyncClient.from_env()

    # Target URL for markdown conversion
    url = "https://scrapegraphai.com/"

    print(f"🌐 Target URL: {url}")
    print("🤖 AI Prompt: None (no AI processing)")
    print("📊 Crawl Depth: 2")
    print("📄 Max Pages: 2")
    print("🗺️ Use Sitemap: False")
    print("💡 Mode: Pure HTML to markdown conversion")
    print()

    # Start the markdown conversion job
    print("🚀 Starting markdown conversion job...")

    # Call crawl with extraction_mode=False for markdown conversion
    response = await client.crawl(
        url=url,
        extraction_mode=False,  # False = markdown conversion mode (NO AI/LLM used)
        depth=2,
        max_pages=2,
        same_domain_only=True,
        sitemap=False,  # Sitemap discovery disabled for this run
        # Note: no prompt or data_schema is needed when extraction_mode=False
    )

    crawl_id = response.get("crawl_id") or response.get("task_id")

    if not crawl_id:
        print("❌ Failed to start markdown conversion job")
        return

    print(f"📋 Crawl ID: {crawl_id}")
    print("⏳ Polling for results...")
    print()

    # Poll for results with rate-limit protection
    try:
        result = await poll_for_result(client, crawl_id, max_attempts=20)

        print("✅ Markdown conversion completed successfully!")
        print()

        result_data = result.get("result", {})
        pages = result_data.get("pages", [])
        crawled_urls = result_data.get("crawled_urls", [])
        credits_used = result_data.get("credits_used", 0)
        pages_processed = result_data.get("pages_processed", 0)

        print("📊 CONVERSION RESULTS:")
        print("-" * 40)
        print(f"📄 Pages processed: {pages_processed}")
        print(f"💰 Credits used: {credits_used}")
        print(f"💵 Cost per page: {credits_used / pages_processed if pages_processed > 0 else 0:.1f} credits")
        if crawled_urls:
            print(f"🔗 URLs processed: {crawled_urls}")
        print()

        print("📝 MARKDOWN CONTENT:")
        print("-" * 40)
        if pages:
            print(f"📄 Total pages with markdown: {len(pages)}")
            for i, page in enumerate(pages[:3]):  # Show the first 3 pages
                print(f"\n📄 Page {i + 1}:")
                print(f"   URL: {page.get('url')}")
                print(f"   Title: {page.get('title')}")

                metadata = page.get("metadata", {})
                print(f"   📊 Word count: {metadata.get('word_count', 0)}")
                print(f"   📋 Headers: {metadata.get('headers', [])[:3]}")  # First 3 headers
                print(f"   🔗 Links: {metadata.get('links_count', 0)}")

                # Show a preview of the markdown content
                markdown_content = page.get("markdown", "")
                markdown_preview = markdown_content[:200]
                if len(markdown_content) > 200:
                    markdown_preview += "..."
                print(f"   📝 Content preview: {markdown_preview}")

            if len(pages) > 3:
                print(f"\n   ... and {len(pages) - 3} more pages with markdown content")
        else:
            print("No markdown content available")

    except Exception as e:
        print(f"❌ Markdown conversion failed: {e}")


async def main():
    """Run the async markdown crawling example."""
    print("🌐 ScrapeGraphAI Async Crawler - Markdown Conversion Example")
    print("Cost-effective HTML to Markdown conversion (NO AI/LLM)")
    print("=" * 60)

    # Check whether the API key is set
    api_key = os.getenv("SGAI_API_KEY")
    if not api_key:
        print("⚠️ Please set your API key in the environment variable SGAI_API_KEY")
        print("   export SGAI_API_KEY=your_api_key_here")
        print()
        print("   You can get your API key from: https://dashboard.scrapegraphai.com")
        return

    print(f"🔑 Using API key: {api_key[:10]}...")
    print()

    # Run the markdown conversion example
    await markdown_crawling_example()

    print("\n" + "=" * 60)
    print("🎉 Example completed!")
    print("💡 This demonstrates async markdown conversion mode:")
    print("   • Cost-effective: only 2 credits per page")
    print("   • No AI/LLM processing - pure HTML to markdown conversion")
    print("   • Perfect for content archival and documentation")
    print("   • 80% cheaper than AI extraction mode!")


if __name__ == "__main__":
    asyncio.run(main())
```
