
Commit b38fb2b

Add a Lambda for web crawling to create the index
1 parent 4685d35 commit b38fb2b

File tree

7 files changed: +222 -1 lines changed


src/lex-gen-ai-demo-cdk/app.py

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 import aws_cdk as cdk

 from lex_gen_ai_demo_cdk_files.lex_gen_ai_demo_cdk_files_stack import LexGenAIDemoFilesStack
+from create_web_crawler_lambda import LambdaStack
 from endpoint_handler import create_endpoint_from_HF_image

 # create_endpoint_from_HF_image(hf_model_id, instance_type="ml.g5.8xlarge", endpoint_name=SAGEMAKER_ENDPOINT_NAME, number_of_gpu=1)
@@ -10,5 +11,6 @@

 app = cdk.App()
 filestack = LexGenAIDemoFilesStack(app, "LexGenAIDemoFilesStack")
+web_crawler_lambda_stack = LambdaStack(app, 'LexGenAIDemoFilesStack-Webcrawler')

 app.synth()
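A deployment note (an assumption about the demo's workflow, not something shown in this commit): because both stacks are registered on the same cdk.App, the new crawler stack should be deployable on its own with "cdk deploy LexGenAIDemoFilesStack-Webcrawler", or together with the main stack via "cdk deploy --all", from the CDK app directory.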

src/lex-gen-ai-demo-cdk/create_web_crawler_lambda.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
from aws_cdk import (
    Duration, Stack,
    aws_lambda as lambda_,
    aws_s3 as s3,
    aws_iam as iam
)

from constructs import Construct

class LambdaStack(Stack):

    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)
        # IAM role for the Lambda: S3 access for storing crawl results, plus CloudWatch Logs permissions
        web_crawl_lambda_cfn_role = iam.Role(self, "Cfn-gen-ai-demo-web-crawler",
            assumed_by=iam.ServicePrincipal("lambda.amazonaws.com")
        )
        web_crawl_lambda_cfn_role.add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"))
        web_crawl_lambda_cfn_role.add_to_policy(
            iam.PolicyStatement(
                actions=[
                    "logs:CreateLogGroup",
                    "logs:CreateLogStream",
                    "logs:PutLogEvents"
                ],
                resources=["*"]
            )
        )
        # Lambda function built from the web-crawler Docker image asset
        lambda_function = lambda_.DockerImageFunction(self, "web-crawler-docker-image-CFN",
            function_name="WebCrawlerLambda",
            code=lambda_.DockerImageCode.from_image_asset("web-crawler-docker-image"),
            role=web_crawl_lambda_cfn_role,
            memory_size=1024,
            timeout=Duration.minutes(5)
        )
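DockerImageCode.from_image_asset("web-crawler-docker-image") points CDK at the asset directory whose Dockerfile appears later in this commit; CDK builds that image locally and publishes it to an ECR asset repository during deployment, so no manual image push is needed. The 1024 MB memory size and 5-minute timeout are presumably sized so a multi-page crawl can finish within a single invocation.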

src/lex-gen-ai-demo-cdk/lex_gen_ai_demo_cdk_files/lex_gen_ai_demo_cdk_files_stack.py

Lines changed: 1 addition & 1 deletion
@@ -127,4 +127,4 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
            auto_build_bot_locales=True
        )

(The only change in this file is whitespace on the final line.)

src/lex-gen-ai-demo-cdk/web-crawler-docker-image/Dockerfile

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
FROM public.ecr.aws/lambda/python:3.8

COPY web_crawler_requirements.txt .
RUN pip3 install -r web_crawler_requirements.txt --target "${LAMBDA_TASK_ROOT}"

# Copy function code
COPY *.py ${LAMBDA_TASK_ROOT}

# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
CMD [ "web_crawler_app.handler" ]
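This Dockerfile lives in the web-crawler-docker-image asset directory referenced by the stack above. The AWS Lambda Python 3.8 base image already includes the Lambda runtime interface client, so the image only needs the crawler's dependencies and code; the CMD points the runtime at the handler function in web_crawler_app.py.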

src/lex-gen-ai-demo-cdk/web-crawler-docker-image/web_crawler_app.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
import boto3
import requests
import html2text
from typing import List
import re
import logging
import json
import traceback

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def find_http_urls_in_parentheses(s: str, prefix: str = None):
    pattern = r'\((https?://[^)]+)\)'
    urls = re.findall(pattern, s)

    matched = []
    if prefix is not None:
        for url in urls:
            if str(url).startswith(prefix):
                matched.append(url)
    else:
        matched = urls

    return list(set(matched))  # remove duplicates by converting to a set, then back to a list


class EZWebLoader:

    def __init__(self, default_header: str = None):
        self._html_to_text_parser = html2text
        if default_header is None:
            self._default_header = {"User-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36"}
        else:
            self._default_header = default_header

    def load_data(self,
                  urls: List[str],
                  num_levels: int = 0,
                  level_prefix: str = None,
                  headers: str = None) -> List[str]:

        logging.info(f"Number of urls: {len(urls)}.")

        if headers is None:
            headers = self._default_header

        documents = []
        visited = {}
        for url in urls:
            q = [url]
            depth = num_levels
            for page in q:
                if page not in visited:  # prevent cycles by checking whether we already crawled this link
                    logging.info(f"Crawling {page}")
                    visited[page] = True  # mark the page as visited to prevent re-crawling it
                    response = requests.get(page, headers=headers).text
                    response = self._html_to_text_parser.html2text(response)  # reduce HTML to plain text
                    documents.append(response)
                    if depth > 0:
                        # crawl linked pages
                        ingest_urls = find_http_urls_in_parentheses(response, level_prefix)
                        logging.info(f"Found {len(ingest_urls)} pages to crawl.")
                        q.extend(ingest_urls)
                        depth -= 1  # reduce the depth counter so we only go num_levels deep in our crawl
                else:
                    logging.info(f"Skipping {page} as it has already been crawled")
        logging.info(f"Number of documents: {len(documents)}.")
        return documents


ACCOUNT_ID = boto3.client('sts').get_caller_identity().get('Account')
S3_BUCKET = "lexgenaistack-source-materials-bucket-" + ACCOUNT_ID
FILE_NAME = 'web-crawl-results.txt'


def handler(event, context):
    url = "http://www.zappos.com/general-questions"
    depth = 1
    level_prefix = "https://www.zappos.com/"

    if event is not None:
        if "url" in event:
            url = event["url"]
        if "depth" in event:
            depth = int(event["depth"])
        if "level_prefix" in event:
            level_prefix = event["level_prefix"]

    # crawl the website
    try:
        logger.info(f"Crawling {url} to depth of {depth}...")
        loader = EZWebLoader()
        documents = loader.load_data([url], depth, level_prefix)
        doc_string = json.dumps(documents, indent=1)
        logger.info(f"Crawling {url} to depth of {depth} succeeded")
    except Exception as e:
        # If there's an error, log the error message and traceback
        logging.error(f"An error occurred during the crawl of {url}.")
        exception_traceback = traceback.format_exc()
        logger.error(exception_traceback)
        return {
            "status": 500,
            "message": exception_traceback
        }
    # save the results for indexing
    try:
        # Use the S3 client to write the string to S3
        s3 = boto3.client('s3')
        s3.put_object(Body=doc_string, Bucket=S3_BUCKET, Key=FILE_NAME)
        success_msg = f'Successfully put {FILE_NAME} to {S3_BUCKET}'
        logging.info(success_msg)
        return {
            "status": 200,
            "message": success_msg
        }
    except Exception as e:
        # If there's an error, log the traceback
        exception_traceback = traceback.format_exc()
        logger.error(exception_traceback)
        return {
            "status": 500,
            "message": exception_traceback
        }
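Since the handler reads url, depth, and level_prefix from the incoming event, a quick way to exercise it outside of Lambda is a small local driver like the sketch below. This is illustrative only: it assumes AWS credentials are available (the module resolves the account ID at import time and the handler writes to S3) and that the lexgenaistack source-materials bucket already exists in the account.

# Local smoke test for the crawler handler; an illustrative sketch, not part of the commit.
# Assumes AWS credentials are configured and the target S3 bucket already exists.
from web_crawler_app import handler

test_event = {
    "url": "https://www.zappos.com/general-questions",
    "depth": 1,
    "level_prefix": "https://www.zappos.com/",
}

# The handler never touches the context argument, so None is fine here.
print(handler(test_event, None))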

src/lex-gen-ai-demo-cdk/web-crawler-docker-image/web_crawler_requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
requests
html2text
accelerate
boto3

src/lex-gen-ai-demo-cdk/web_crawl.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
import boto3
import argparse
import json


def invoke_lambda(url=None, depth=1, level_prefix=None):
    client = boto3.client('lambda')

    # Prepare the payload
    payload = {}
    if url is not None:
        payload["url"] = url
    if depth is not None:
        payload["depth"] = depth
    if level_prefix is not None:
        payload["level_prefix"] = level_prefix

    try:
        response = client.invoke(
            FunctionName='WebCrawlerLambda',
            InvocationType='RequestResponse',
            LogType='Tail',
            # The payload must be a JSON-formatted string
            Payload=json.dumps(payload)
        )

        # The response payload from Lambda is a JSON string, so it needs to be read and decoded
        result = response['Payload'].read().decode('utf-8')

        print("Response: " + result)

    except Exception as e:
        print(e)


# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('--url', type=str, help='The URL to process.', required=False, default=None)
parser.add_argument('--depth', type=int, help='The depth of the crawl.', required=False, default=1)
parser.add_argument('--level_prefix', type=str, help='The prefix that any links must contain to be crawled.', required=False, default=None)
args = parser.parse_args()

invoke_lambda(args.url, args.depth, args.level_prefix)
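Once the crawler stack is deployed, a crawl can be kicked off from the command line with something like "python3 web_crawl.py --url https://www.zappos.com/general-questions --depth 2 --level_prefix https://www.zappos.com/" (the URL and prefix here just reuse the handler's defaults for illustration); with no arguments, the Lambda falls back to the defaults hard-coded in web_crawler_app.py.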
