
Commit 1a9053e

feat: add js files
1 parent e5d573b commit 1a9053e

5 files changed, +1029 -32 lines changed

Lines changed: 269 additions & 0 deletions
@@ -0,0 +1,269 @@
#!/usr/bin/env node

/**
 * Example script demonstrating the ScrapeGraphAI Crawler markdown conversion mode.
 *
 * This example shows how to use the crawler in markdown conversion mode:
 * - Cost-effective markdown conversion (NO AI/LLM processing)
 * - 2 credits per page (80% savings compared to AI mode)
 * - Clean HTML to markdown conversion with metadata extraction
 *
 * Requirements:
 * - Node.js 18+ (the script relies on the built-in fetch API)
 * - dotenv
 * - A .env file with your API_KEY
 *
 * Example .env file:
 * API_KEY=your_api_key_here
 */

import 'dotenv/config';

// Configuration - API key from environment or fallback
const API_KEY = process.env.API_KEY || process.env.TEST_API_KEY || "sgai-xxx"; // Load from .env (API_KEY), with TEST_API_KEY kept as a fallback
const BASE_URL = process.env.BASE_URL || "http://localhost:8001"; // Can be overridden via env

/**
 * Make an HTTP request to the API.
 * @param {string} url - The URL to make the request to
 * @param {Object} data - The data to send in the request body
 * @returns {Promise<Object>} The response JSON
 */
async function makeRequest(url, data) {
  const headers = {
    "Content-Type": "application/json",
    "SGAI-APIKEY": API_KEY
  };

  const response = await fetch(url, {
    method: 'POST',
    headers: headers,
    body: JSON.stringify(data)
  });

  return await response.json();
}
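
// Illustrative usage (not part of the original commit; the endpoint and payload
// mirror the call made later in this file):
//   const response = await makeRequest(`${BASE_URL}/v1/crawl`, requestData);
//   const taskId = response.task_id;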

/**
 * Poll for the result of a crawl job with rate limit handling.
 * @param {string} taskId - The task ID to poll for
 * @returns {Promise<Object>} The response JSON
 */
async function pollResult(taskId) {
  const headers = { "SGAI-APIKEY": API_KEY };
  const url = `${BASE_URL}/v1/crawl/${taskId}`;

  const response = await fetch(url, {
    method: 'GET',
    headers: headers
  });

  if (response.status === 429) {
    // Rate limited - return special status to handle in polling loop
    return { status: "rate_limited", retry_after: 60 };
  }

  return await response.json();
}

/**
 * Poll for crawl results with intelligent backoff to avoid rate limits.
 * @param {string} taskId - The task ID to poll for
 * @param {number} maxAttempts - Maximum number of polling attempts
 * @returns {Promise<Object>} The final result; throws an error on timeout or failure
 */
async function pollWithBackoff(taskId, maxAttempts = 20) {
  console.log("⏳ Starting to poll for results with rate-limit protection...");

  // Initial wait to give the job time to start processing
  await new Promise(resolve => setTimeout(resolve, 15000));

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      const result = await pollResult(taskId);
      const status = result.status;

      if (status === "rate_limited") {
        const waitTime = Math.min(90, 30 + (attempt * 10)); // Linear backoff for rate limits, capped at 90s
        console.log(`⚠️ Rate limited! Waiting ${waitTime}s before retry...`);
        await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
        continue;
      } else if (status === "success") {
        return result;
      } else if (status === "failed") {
        throw new Error(`Crawl failed: ${result.error || 'Unknown error'}`);
      } else {
        // Calculate progressive wait time: start at 15s, increase gradually
        const baseWait = 15;
        const progressiveWait = Math.min(60, baseWait + (attempt * 3)); // Cap at 60s

        console.log(`⏳ Status: ${status} (attempt ${attempt + 1}/${maxAttempts}) - waiting ${progressiveWait}s...`);
        await new Promise(resolve => setTimeout(resolve, progressiveWait * 1000));
      }
    } catch (error) {
      if (error.message.startsWith('Crawl failed')) {
        throw error; // Terminal failure reported by the API - no point retrying
      }
      if (error.message.toLowerCase().includes('rate') || error.message.includes('429')) {
        const waitTime = Math.min(90, 45 + (attempt * 10));
        console.log(`⚠️ Rate limit detected in error, waiting ${waitTime}s...`);
        await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
        continue;
      } else {
        console.log(`❌ Error polling for results: ${error.message}`);
        if (attempt < maxAttempts - 1) {
          await new Promise(resolve => setTimeout(resolve, 20000)); // Wait before retry
          continue;
        }
        throw error;
      }
    }
  }

  throw new Error(`⏰ Timeout: Job did not complete after ${maxAttempts} attempts`);
}
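
// With the constants above, successive polls wait 15s, 18s, 21s, ... capped at
// 60s, so the default 20 attempts allow roughly 14 minutes of processing time
// (on top of the initial 15s delay) before the timeout fires.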

/**
 * Markdown Conversion Mode (NO AI/LLM Used)
 *
 * This example demonstrates cost-effective crawling that converts pages to clean markdown
 * WITHOUT any AI processing. Perfect for content archival and when you only need clean markdown.
 */
async function markdownCrawlingExample() {
  console.log("=".repeat(60));
  console.log("MARKDOWN CONVERSION MODE (NO AI/LLM)");
  console.log("=".repeat(60));
  console.log("Use case: Get clean markdown content without AI processing");
  console.log("Cost: 2 credits per page (80% savings!)");
  console.log("Features: Clean markdown conversion, metadata extraction");
  console.log("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!");
  console.log();

  // Markdown conversion request - NO AI/LLM processing
  const requestData = {
    url: "https://scrapegraphai.com/",
    extraction_mode: false, // FALSE = Markdown conversion mode (NO AI/LLM used)
    depth: 2,
    max_pages: 2,
    same_domain_only: true,
    sitemap: false, // Set to true to use the sitemap for better coverage
    // Note: No prompt needed when extraction_mode = false
  };
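
  // Per the note above, AI extraction presumably requires extraction_mode: true
  // plus a prompt field, at a higher per-page credit cost; that mode is not
  // exercised in this example.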

  console.log(`🌐 Target URL: ${requestData.url}`);
  console.log("🤖 AI Prompt: None (no AI processing)");
  console.log(`📊 Crawl Depth: ${requestData.depth}`);
  console.log(`📄 Max Pages: ${requestData.max_pages}`);
  console.log(`🗺️ Use Sitemap: ${requestData.sitemap}`);
  console.log("💡 Mode: Pure HTML to markdown conversion");
  console.log();

  // Start the markdown conversion job
  console.log("🚀 Starting markdown conversion job...");
  const response = await makeRequest(`${BASE_URL}/v1/crawl`, requestData);
  const taskId = response.task_id;

  if (!taskId) {
    console.log("❌ Failed to start markdown conversion job");
    return;
  }

  console.log(`📋 Task ID: ${taskId}`);
  console.log("⏳ Polling for results...");
  console.log();

  // Poll for results with rate-limit protection
  try {
    const result = await pollWithBackoff(taskId, 20);

    console.log("✅ Markdown conversion completed successfully!");
    console.log();

    const resultData = result.result || {};
    const pages = resultData.pages || [];
    const crawledUrls = resultData.crawled_urls || [];
    const creditsUsed = resultData.credits_used || 0;
    const pagesProcessed = resultData.pages_processed || 0;

    console.log("📊 CONVERSION RESULTS:");
    console.log("-".repeat(40));
    console.log(`📄 Pages processed: ${pagesProcessed}`);
    console.log(`💰 Credits used: ${creditsUsed}`);
    console.log(`💵 Cost per page: ${pagesProcessed > 0 ? (creditsUsed / pagesProcessed).toFixed(1) : 0} credits`);
    if (crawledUrls.length > 0) {
      console.log(`🔗 URLs processed: ${JSON.stringify(crawledUrls)}`);
    }
    console.log();

    console.log("📝 MARKDOWN CONTENT:");
    console.log("-".repeat(40));
    if (pages.length > 0) {
      console.log(`📄 Total pages with markdown: ${pages.length}`);
      pages.slice(0, 3).forEach((page, i) => { // Show first 3 pages
        console.log(`\n📄 Page ${i + 1}:`);
        console.log(`   URL: ${page.url || 'N/A'}`);
        console.log(`   Title: ${page.title || 'None'}`);

        const metadata = page.metadata || {};
        console.log(`   📊 Word count: ${metadata.word_count || 0}`);
        console.log(`   📋 Headers: ${JSON.stringify((metadata.headers || []).slice(0, 3))}`); // First 3 headers
        console.log(`   🔗 Links: ${metadata.links_count || 0}`);

        // Show markdown preview
        const markdownContent = page.markdown || "";
        let markdownPreview = markdownContent.substring(0, 200);
        if (markdownContent.length > 200) {
          markdownPreview += "...";
        }
        console.log(`   📝 Content preview: ${markdownPreview}`);
      });

      if (pages.length > 3) {
        console.log(`\n   ... and ${pages.length - 3} more pages with markdown content`);
      }
    } else {
      console.log("No markdown content available");
    }

  } catch (error) {
    console.log(`❌ Markdown conversion failed: ${error.message}`);
  }
}

/**
 * Main function to run the markdown crawling example.
 */
async function main() {
  console.log("🌐 ScrapeGraphAI Crawler - Markdown Conversion Example");
  console.log("Cost-effective HTML to Markdown conversion (NO AI/LLM)");
  console.log("=".repeat(60));

  // Check if API key is set
  if (API_KEY === "sgai-xxx") {
    console.log("⚠️ Please set your API key in the .env file");
    console.log("   Create a .env file with your API key:");
    console.log("   API_KEY=your_api_key_here");
    console.log();
    console.log("   You can get your API key from: https://dashboard.scrapegraphai.com");
    console.log();
    console.log("   Example .env file:");
    console.log("   API_KEY=sgai-your-actual-api-key-here");
    console.log("   BASE_URL=https://api.scrapegraphai.com  # Optional");
    return;
  }

  console.log(`🔑 Using API key: ${API_KEY.substring(0, 10)}...`);
  console.log(`🌐 Base URL: ${BASE_URL}`);
  console.log();

  // Run the single example
  await markdownCrawlingExample(); // Markdown conversion mode (NO AI)

  console.log("\n" + "=".repeat(60));
  console.log("🎉 Example completed!");
  console.log("💡 This demonstrates markdown conversion mode:");
  console.log("   • Cost-effective: Only 2 credits per page");
  console.log("   • No AI/LLM processing - pure HTML to markdown conversion");
  console.log("   • Perfect for content archival and documentation");
  console.log("   • 80% cheaper than AI extraction modes!");
}

// Run the example
main().catch(console.error);
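
// To try the script locally (the filename is not shown in this diff; assuming it
// is saved as, e.g., markdown_crawl.js): run `npm install dotenv`, create the
// .env file described above, then `node markdown_crawl.js`. Note that the import
// syntax requires "type": "module" in package.json or an .mjs extension.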
