This repository was archived by the owner on Aug 25, 2024. It is now read-only.

Commit a90c902

feat(webcrawler): improve unchanged pages sourcing (#131)
New flags:

- `only-main-content`: defaults to false; if enabled, script, style (and other) tags are removed from the emitted document. This is particularly helpful for detecting actual semantic changes to the pages, as opposed to incidental noise (script versioning, cache busting, etc.).
- `emit-content-diff`: a list, defaulting to all content-diff types. You can filter which content-diff types you want the source to emit, if available. For example, to not emit content_unchanged, you can set `emit-content-diff: ['new', 'content_changed']`.
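For illustration, a minimal sketch of how these flags could be passed to the agent's `init(Map<String, Object>)` method shown in the diff below. The key names are the ones added in this commit; the seed URL and the exact content-diff type names are assumptions.

```java
import java.util.List;
import java.util.Map;

public class WebCrawlerSourceConfigSketch {
    public static void main(String[] args) {
        // Hypothetical configuration map for the webcrawler source.
        Map<String, Object> configuration =
                Map.of(
                        "seed-urls", List.of("https://example.com/docs/"),
                        // Strip script, style and similar tags before the document is emitted,
                        // so re-crawled pages only register as changed on semantic differences.
                        "only-main-content", true,
                        // Emit only these content-diff types; content_unchanged pages are dropped.
                        "emit-content-diff", List.of("new", "content_changed"));
        System.out.println(configuration);
    }
}
```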
1 parent 241d16e commit a90c902

5 files changed: +287 -78 lines changed

langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java

Lines changed: 33 additions & 21 deletions
```diff
@@ -18,11 +18,7 @@
 import static ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration.DEFAULT_USER_AGENT;
 import static ai.langstream.api.util.ConfigurationUtils.*;
 
-import ai.langstream.agents.webcrawler.crawler.Document;
-import ai.langstream.agents.webcrawler.crawler.StatusStorage;
-import ai.langstream.agents.webcrawler.crawler.WebCrawler;
-import ai.langstream.agents.webcrawler.crawler.WebCrawlerConfiguration;
-import ai.langstream.agents.webcrawler.crawler.WebCrawlerStatus;
+import ai.langstream.agents.webcrawler.crawler.*;
 import ai.langstream.ai.agents.commons.state.LocalDiskStateStorage;
 import ai.langstream.ai.agents.commons.state.S3StateStorage;
 import ai.langstream.ai.agents.commons.state.StateStorage;
@@ -159,20 +155,11 @@ public void init(Map<String, Object> configuration) throws Exception {
                                                 entry.getKey(), entry.getValue()))
                         .collect(Collectors.toUnmodifiableList());
 
-        log.info("allowed-domains: {}", allowedDomains);
-        log.info("forbidden-paths: {}", forbiddenPaths);
-        log.info("allow-non-html-contents: {}", allowNonHtmlContents);
-        log.info("seed-urls: {}", seedUrls);
-        log.info("max-urls: {}", maxUrls);
-        log.info("max-depth: {}", maxDepth);
-        log.info("handle-robots-file: {}", handleRobotsFile);
-        log.info("scan-html-documents: {}", scanHtmlDocuments);
-        log.info("user-agent: {}", userAgent);
-        log.info("max-unflushed-pages: {}", maxUnflushedPages);
-        log.info("min-time-between-requests: {}", minTimeBetweenRequests);
-        log.info("reindex-interval-seconds: {}", reindexIntervalSeconds);
-
-        WebCrawlerConfiguration webCrawlerConfiguration =
+        final boolean onlyMainContent = getBoolean("only-main-content", false, configuration);
+        final Set<String> excludeFromMainContentTags =
+                getSet("exclude-from-main-content-tags", configuration);
+
+        WebCrawlerConfiguration.WebCrawlerConfigurationBuilder builder =
                 WebCrawlerConfiguration.builder()
                         .allowedDomains(allowedDomains)
                         .allowNonHtmlContents(allowNonHtmlContents)
@@ -185,16 +172,41 @@ public void init(Map<String, Object> configuration) throws Exception {
                         .handleCookies(handleCookies)
                         .httpTimeout(httpTimeout)
                         .maxErrorCount(maxErrorCount)
-                        .build();
+                        .onlyMainContent(onlyMainContent);
+        if (!excludeFromMainContentTags.isEmpty()) {
+            builder.excludeFromMainContentTags(excludeFromMainContentTags);
+        }
+        WebCrawlerConfiguration webCrawlerConfiguration = builder.build();
+        log.info("configuration: {}", webCrawlerConfiguration);
 
         WebCrawlerStatus status = new WebCrawlerStatus();
         // this can be overwritten when the status is reloaded
        status.setLastIndexStartTimestamp(System.currentTimeMillis());
+
+        final List<String> emitContentDiff =
+                getList("emit-content-diff", configuration).stream()
+                        .map(String::toLowerCase)
+                        .toList();
+
         crawler =
                 new WebCrawler(
                         webCrawlerConfiguration,
                         status,
-                        foundDocuments::add,
+                        new DocumentVisitor() {
+                            @Override
+                            public void visit(Document document) {
+                                if (document.contentDiff() == null
+                                        || emitContentDiff.isEmpty()
+                                        || emitContentDiff.contains(
+                                                document.contentDiff().toString().toLowerCase())) {
+                                    foundDocuments.add(document);
+                                } else {
+                                    log.info(
+                                            "Discarding document with content diff {}",
+                                            document.contentDiff());
+                                }
+                            }
+                        },
                         this::sendDeletedDocument);
 
         sourceActivitySummaryTopic =
```
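To spell out the emission rule implemented by the `DocumentVisitor` above, here is a standalone sketch of the same decision. The class and method names are invented for illustration, and the diff-type strings used in `main` are assumptions.

```java
import java.util.List;
import java.util.Locale;

public class EmitContentDiffFilter {

    /**
     * Mirrors the visitor above: a document is emitted when it carries no content-diff
     * information, when no filter is configured, or when its diff type (lowercased)
     * is one of the configured values.
     */
    static boolean shouldEmit(String contentDiff, List<String> emitContentDiff) {
        return contentDiff == null
                || emitContentDiff.isEmpty()
                || emitContentDiff.contains(contentDiff.toLowerCase(Locale.ROOT));
    }

    public static void main(String[] args) {
        List<String> filter = List.of("new", "content_changed"); // illustrative diff-type names
        System.out.println(shouldEmit("NEW", filter));                   // true
        System.out.println(shouldEmit("CONTENT_UNCHANGED", filter));     // false: discarded
        System.out.println(shouldEmit(null, filter));                    // true: no diff info, always emitted
        System.out.println(shouldEmit("CONTENT_UNCHANGED", List.of()));  // true: empty filter emits everything
    }
}
```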

langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java

Lines changed: 6 additions & 0 deletions
```diff
@@ -270,6 +270,12 @@ public boolean runCycle() throws Exception {
                             }
                         });
             }
+            if (configuration.isOnlyMainContent()) {
+                for (String excludeFromMainContentTag :
+                        configuration.getExcludeFromMainContentTags()) {
+                    document.getElementsByTag(excludeFromMainContentTag).remove();
+                }
+            }
             onDocumentFound(current, document.html().getBytes(StandardCharsets.UTF_8), contentType);
         }
 
```
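As a rough illustration of the tag stripping above, a small self-contained sketch using jsoup (which is what the `Document` in the crawler code is). The HTML and the reduced tag set here are invented for the example.

```java
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class OnlyMainContentExample {
    public static void main(String[] args) {
        String html =
                "<html><head><script src=\"app.js?v=12345\"></script></head>"
                        + "<body><p>Release notes for 1.2</p></body></html>";

        Document document = Jsoup.parse(html);
        // Same pattern as the crawler: drop every element matching an excluded tag,
        // so cache-busting query strings in <script> tags no longer change the output.
        for (String tag : Set.of("script", "style", "noscript", "iframe")) {
            document.getElementsByTag(tag).remove();
        }
        System.out.println(document.html());
    }
}
```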

langstream-agents/langstream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java

Lines changed: 22 additions & 2 deletions
```diff
@@ -40,8 +40,28 @@ public class WebCrawlerConfiguration {
     @Builder.Default private boolean handleRobotsFile = true;
     @Builder.Default private boolean scanHtmlDocuments = true;
     @Builder.Default private boolean allowNonHtmlContents = false;
+    @Builder.Default private boolean onlyMainContent = false;
 
-    @Builder.Default private Set<String> allowedTags = Set.of("a");
+    @Builder.Default
+    private Set<String> excludeFromMainContentTags =
+            Set.of(
+                    "script",
+                    "style",
+                    "noscript",
+                    "iframe",
+                    "link",
+                    "base",
+                    "meta",
+                    "object",
+                    "embed",
+                    "applet",
+                    "audio",
+                    "video",
+                    "canvas",
+                    "template",
+                    "comment");
+
+    @Builder.Default private Set<String> allowedTagsForHtmlDocumentScan = Set.of("a");
 
     public boolean isAllowedUrl(String url) {
         final String domainOnly;
@@ -96,6 +116,6 @@ public boolean isAllowedUrl(String url) {
     }
 
     public boolean isAllowedTag(String tagName) {
-        return tagName != null && allowedTags.contains(tagName.toLowerCase());
+        return tagName != null && allowedTagsForHtmlDocumentScan.contains(tagName.toLowerCase());
     }
 }
```
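And a quick sketch (as a fragment) of overriding the new fields through the Lombok-generated builder; the chosen tag set is just an example.

```java
// When excludeFromMainContentTags is not set, the defaults listed above apply.
WebCrawlerConfiguration configuration =
        WebCrawlerConfiguration.builder()
                .onlyMainContent(true)
                .excludeFromMainContentTags(Set.of("script", "style", "noscript"))
                .build();
```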
