#!/usr/bin/env node

/**
 * Example script demonstrating the ScrapeGraphAI Crawler markdown conversion mode.
 *
 * This example shows how to use the crawler in markdown conversion mode:
 * - Cost-effective markdown conversion (NO AI/LLM processing)
 * - 2 credits per page (80% savings compared to AI mode)
 * - Clean HTML to markdown conversion with metadata extraction
 *
 * Requirements:
 * - Node.js 18+ (for the built-in fetch API)
 * - dotenv
 * - A .env file with your API_KEY
 *
 * Example .env file:
 * API_KEY=your_api_key_here
 */

import 'dotenv/config';

// Configuration - API key from environment or fallback
const API_KEY = process.env.API_KEY || "sgai-xxx"; // Loaded from the .env file
const BASE_URL = process.env.BASE_URL || "http://localhost:8001"; // Can be overridden via env

/**
 * Make an HTTP request to the API.
 * @param {string} url - The URL to make the request to
 * @param {Object} data - The data to send in the request body
 * @returns {Promise<Object>} The response JSON
 */
async function makeRequest(url, data) {
  const headers = {
    "Content-Type": "application/json",
    "SGAI-APIKEY": API_KEY
  };

  const response = await fetch(url, {
    method: 'POST',
    headers: headers,
    body: JSON.stringify(data)
  });

  return await response.json();
}

/**
 * Poll for the result of a crawl job with rate limit handling.
 * @param {string} taskId - The task ID to poll for
 * @returns {Promise<Object>} The response JSON
 */
async function pollResult(taskId) {
  const headers = { "SGAI-APIKEY": API_KEY };
  const url = `${BASE_URL}/v1/crawl/${taskId}`;

  const response = await fetch(url, {
    method: 'GET',
    headers: headers
  });

  if (response.status === 429) {
    // Rate limited - return special status to handle in polling loop
    return { status: "rate_limited", retry_after: 60 };
  }

  return await response.json();
}
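
// For reference, the rest of this script assumes a completed crawl response
// shaped roughly like the sketch below. This is inferred from the fields read
// in markdownCrawlingExample(); the live API may include additional fields:
// {
//   "status": "success",
//   "result": {
//     "pages": [{
//       "url": "...", "title": "...", "markdown": "...",
//       "metadata": { "word_count": 0, "headers": [], "links_count": 0 }
//     }],
//     "crawled_urls": ["..."],
//     "credits_used": 4,
//     "pages_processed": 2
//   }
// }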

/**
 * Poll for crawl results with progressive backoff to avoid rate limits.
 * @param {string} taskId - The task ID to poll for
 * @param {number} maxAttempts - Maximum number of polling attempts
 * @returns {Promise<Object>} The final result; throws on timeout or failure
 */
async function pollWithBackoff(taskId, maxAttempts = 20) {
  console.log("⏳ Starting to poll for results with rate-limit protection...");

  // Initial wait to give the job time to start processing
  await new Promise(resolve => setTimeout(resolve, 15000));

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      const result = await pollResult(taskId);
      const status = result.status;

      if (status === "rate_limited") {
        const waitTime = Math.min(90, 30 + (attempt * 10)); // Linear backoff for rate limits, capped at 90s
        console.log(`⚠️ Rate limited! Waiting ${waitTime}s before retry...`);
        await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
        continue;
      } else if (status === "success") {
        return result;
      } else if (status === "failed") {
        throw new Error(`Crawl failed: ${result.error || 'Unknown error'}`);
      } else {
        // Calculate progressive wait time: start at 15s, increase gradually
        const baseWait = 15;
        const progressiveWait = Math.min(60, baseWait + (attempt * 3)); // Cap at 60s

        console.log(`⏳ Status: ${status} (attempt ${attempt + 1}/${maxAttempts}) - waiting ${progressiveWait}s...`);
        await new Promise(resolve => setTimeout(resolve, progressiveWait * 1000));
      }
    } catch (error) {
      // A "failed" status is terminal - rethrow instead of retrying
      if (error.message.startsWith('Crawl failed')) {
        throw error;
      }
      if (error.message.toLowerCase().includes('rate') || error.message.includes('429')) {
        const waitTime = Math.min(90, 45 + (attempt * 10));
        console.log(`⚠️ Rate limit detected in error, waiting ${waitTime}s...`);
        await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
        continue;
      } else {
        console.log(`❌ Error polling for results: ${error.message}`);
        if (attempt < maxAttempts - 1) {
          await new Promise(resolve => setTimeout(resolve, 20000)); // Wait before retry
          continue;
        }
        throw error;
      }
    }
  }

  throw new Error(`⏰ Timeout: Job did not complete after ${maxAttempts} attempts`);
}

/**
 * Markdown Conversion Mode (NO AI/LLM Used)
 *
 * This example demonstrates cost-effective crawling that converts pages to clean markdown
 * WITHOUT any AI processing. Perfect for content archival and for when you only need clean markdown.
 */
async function markdownCrawlingExample() {
  console.log("=".repeat(60));
  console.log("MARKDOWN CONVERSION MODE (NO AI/LLM)");
  console.log("=".repeat(60));
  console.log("Use case: Get clean markdown content without AI processing");
  console.log("Cost: 2 credits per page (80% savings!)");
  console.log("Features: Clean markdown conversion, metadata extraction");
  console.log("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!");
  console.log();

  // Markdown conversion request - NO AI/LLM processing
  const requestData = {
    url: "https://scrapegraphai.com/",
    extraction_mode: false, // false = markdown conversion mode (NO AI/LLM used)
    depth: 2,
    max_pages: 2,
    same_domain_only: true,
    sitemap: false, // Set to true to crawl URLs from the sitemap for better coverage
    // Note: No prompt needed when extraction_mode = false
  };

  console.log(`🌐 Target URL: ${requestData.url}`);
  console.log("🤖 AI Prompt: None (no AI processing)");
  console.log(`📊 Crawl Depth: ${requestData.depth}`);
  console.log(`📄 Max Pages: ${requestData.max_pages}`);
  console.log(`🗺️ Use Sitemap: ${requestData.sitemap}`);
  console.log("💡 Mode: Pure HTML to markdown conversion");
  console.log();

  // Start the markdown conversion job
  console.log("🚀 Starting markdown conversion job...");
  const response = await makeRequest(`${BASE_URL}/v1/crawl`, requestData);
  const taskId = response.task_id;

  if (!taskId) {
    console.log("❌ Failed to start markdown conversion job");
    return;
  }

  console.log(`📋 Task ID: ${taskId}`);
  console.log("⏳ Polling for results...");
  console.log();

  // Poll for results with rate-limit protection
  try {
    const result = await pollWithBackoff(taskId, 20);

    console.log("✅ Markdown conversion completed successfully!");
    console.log();

    const resultData = result.result || {};
    const pages = resultData.pages || [];
    const crawledUrls = resultData.crawled_urls || [];
    const creditsUsed = resultData.credits_used || 0;
    const pagesProcessed = resultData.pages_processed || 0;

    console.log("📊 CONVERSION RESULTS:");
    console.log("-".repeat(40));
    console.log(`📄 Pages processed: ${pagesProcessed}`);
    console.log(`💰 Credits used: ${creditsUsed}`);
    console.log(`💵 Cost per page: ${pagesProcessed > 0 ? (creditsUsed / pagesProcessed).toFixed(1) : 0} credits`);
    if (crawledUrls.length > 0) {
      console.log(`🔗 URLs processed: ${JSON.stringify(crawledUrls)}`);
    }
    console.log();

    console.log("📝 MARKDOWN CONTENT:");
    console.log("-".repeat(40));
    if (pages.length > 0) {
      console.log(`📄 Total pages with markdown: ${pages.length}`);
      pages.slice(0, 3).forEach((page, i) => { // Show the first 3 pages
        console.log(`\n📄 Page ${i + 1}:`);
        console.log(`   URL: ${page.url || 'N/A'}`);
        console.log(`   Title: ${page.title || 'None'}`);

        const metadata = page.metadata || {};
        console.log(`   📊 Word count: ${metadata.word_count || 0}`);
        console.log(`   📋 Headers: ${JSON.stringify((metadata.headers || []).slice(0, 3))}`); // First 3 headers
        console.log(`   🔗 Links: ${metadata.links_count || 0}`);

        // Show a preview of the markdown content
        const markdownContent = page.markdown || "";
        let markdownPreview = markdownContent.substring(0, 200);
        if (markdownContent.length > 200) {
          markdownPreview += "...";
        }
        console.log(`   📝 Content preview: ${markdownPreview}`);
      });

      if (pages.length > 3) {
        console.log(`\n   ... and ${pages.length - 3} more pages with markdown content`);
      }
    } else {
      console.log("No markdown content available");
    }

  } catch (error) {
    console.log(`❌ Markdown conversion failed: ${error.message}`);
  }
}

/**
 * Main function to run the markdown crawling example.
 */
async function main() {
  console.log("🌐 ScrapeGraphAI Crawler - Markdown Conversion Example");
  console.log("Cost-effective HTML to Markdown conversion (NO AI/LLM)");
  console.log("=".repeat(60));

  // Check if the API key is set
  if (API_KEY === "sgai-xxx") {
    console.log("⚠️ Please set your API key in the .env file");
    console.log("   Create a .env file with your API key:");
    console.log("   API_KEY=your_api_key_here");
    console.log();
    console.log("   You can get your API key from: https://dashboard.scrapegraphai.com");
    console.log();
    console.log("   Example .env file:");
    console.log("   API_KEY=sgai-your-actual-api-key-here");
    console.log("   BASE_URL=https://api.scrapegraphai.com  # Optional");
    return;
  }

  console.log(`🔑 Using API key: ${API_KEY.substring(0, 10)}...`);
  console.log(`🌐 Base URL: ${BASE_URL}`);
  console.log();

  // Run the single example
  await markdownCrawlingExample(); // Markdown conversion mode (NO AI)

  console.log("\n" + "=".repeat(60));
  console.log("🎉 Example completed!");
  console.log("💡 This demonstrates markdown conversion mode:");
  console.log("   • Cost-effective: Only 2 credits per page");
  console.log("   • No AI/LLM processing - pure HTML to markdown conversion");
  console.log("   • Perfect for content archival and documentation");
  console.log("   • 80% cheaper than AI extraction modes!");
}

// Run the example
main().catch(console.error);
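
// How to run, as a sketch: this script uses ES module syntax, so save it with
// an .mjs extension (or set "type": "module" in package.json). The file name
// markdown_crawler.mjs below is an illustrative placeholder, not a fixed name.
//
//   npm install dotenv
//   node markdown_crawler.mjs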