This repository was archived by the owner on Aug 11, 2023. It is now read-only.

Commit c9135cb

Merge pull request #8 from wjh18/sitemap-enhancements
Sitemap enhancements
2 parents 6fce75c + c718a52 commit c9135cb

3 files changed (+73 -26 lines changed)


README.md

Lines changed: 8 additions & 4 deletions
@@ -84,14 +84,18 @@ To remove your API key from your default keystore, run `keyring del system psike
 
 ## Sitemap Support
 
-Currently, only URLs to valid XML sitemaps are supported for reports that utilize sitemap format.
+Currently, only URLs to valid XML sitemaps are supported for reports that utilize sitemap format. Please see [sitemaps.org](https://sitemaps.org/protocol.html) for specification details.
 
-Your web server or sitemap plugin must also allow robots to crawl your sitemap. If you see any permission errors that would be the first thing to check.
-
-In the future, support for sitemap indices, multiple sitemaps and more advanced sitemap parsing will hopefully be added.
+Your web server or sitemap plugin must also allow robots to crawl your sitemap. If you see any permission errors, that would be the first thing to check. Certain security solutions like CloudFlare also block crawlers, so whitelisting the server you're running the package from may also be preferable.
 
 Your sitemap URL should be passed in as the positional argument for `url` when running `psi` from the command line.
 
+### Sitemap Index
+
+Support for sitemap index detection was recently added. This requires no additional action on your part. Simply pass your sitemap index in as the `url` argument via the CLI.
+
+If a sitemap index is detected, the package will recursively gather the URLs listed in each sitemap in your sitemap index and include them in requests. If a standard sitemap file is passed, only that sitemap will be processed.
+
 ## Command Line Arguments
 
 If you've installed `pyspeedinsights` with `pip`, the default command to run cli commands is `psi`.
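For readers unfamiliar with the distinction the new Sitemap Index section relies on: per the sitemaps.org protocol linked above, a sitemap index is an XML file whose root element is `<sitemapindex>` and whose `<sitemap>/<loc>` entries point at other sitemaps, while a standard sitemap's root element is `<urlset>`. The sketch below is not part of this commit and its URLs are hypothetical; it only illustrates the shape the new detection keys on. Either kind of URL is passed to the CLI the same way, e.g. `psi https://www.example.com/sitemap_index.xml` (hypothetical URL).

```python
# Hypothetical sitemap index, following the sitemaps.org protocol. Each <sitemap>/<loc>
# points at a child sitemap that the package would fetch and parse in turn, merging
# every page URL it finds into a single request list.
SITEMAP_INDEX = """\
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://www.example.com/sitemap-posts.xml</loc></sitemap>
  <sitemap><loc>https://www.example.com/sitemap-pages.xml</loc></sitemap>
</sitemapindex>
"""
```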

src/pyspeedinsights/app.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 from .api.response import process_excel, process_json
 from .cli import commands
 from .core.excel import ExcelWorkbook
-from .core.sitemap import parse_sitemap, request_sitemap
+from .core.sitemap import process_sitemap, request_sitemap
 
 
 def main():
@@ -42,7 +42,7 @@ def main():
         # Create list of request URLs based on sitemap.
         sitemap_url = url
         sitemap = request_sitemap(sitemap_url)
-        request_urls = parse_sitemap(sitemap)
+        request_urls = process_sitemap(sitemap)
         request_urls = list(set(request_urls))  # Remove duplicates if they exist.
     else:
         # For analyzing a single page, only process the requested URL.
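Taken together with the sitemap.py changes below, the sitemap branch of `main()` now boils down to the flow sketched here. This is a hedged illustration rather than a verbatim excerpt: the import path follows the diff, but the example URL is hypothetical and the real code takes it from the parsed CLI arguments.

```python
from pyspeedinsights.core.sitemap import process_sitemap, request_sitemap

# Hypothetical sitemap URL; in the real flow this comes from the positional `url` argument.
sitemap_url = "https://www.example.com/sitemap.xml"

sitemap = request_sitemap(sitemap_url)   # GET with a browser-like user agent, .xml URL validated
request_urls = process_sitemap(sitemap)  # urlset -> page URLs; sitemapindex -> fetch and recurse
request_urls = list(set(request_urls))   # drop duplicate URLs before requesting reports
```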

src/pyspeedinsights/core/sitemap.py

Lines changed: 63 additions & 20 deletions
@@ -8,16 +8,26 @@
 
 
 def request_sitemap(url):
-    """Retrieve the sitemap from the URL provided in cmd args."""
+    """Retrieve the sitemap from the given URL"""
 
     url = validate_url(url)
-
-    if validate_sitemap_url(url) is not True:
-        err = "Invalid sitemap provided. Please provide a link to a valid XML sitemap."
+    # Set a dummy user agent to avoid bot detection by firewalls
+    # e.g. CloudFlare issues a 403 if it detects the default requests module user-agent
+    dummy_user_agent = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/104.0.5112.79 Safari/537.36"
+    )
+    headers = {"user-agent": dummy_user_agent}
+
+    if not validate_sitemap_url(url):
+        err = (
+            "Invalid sitemap URL provided. Please provide a URL to a valid XML sitemap."
+        )
         raise SystemExit(err)
     try:
         print(f"Requesting sitemap... ({url})")
-        resp = requests.get(url)
+        resp = requests.get(url, headers=headers)
         resp.raise_for_status()
     except requests.exceptions.HTTPError as errh:
         raise SystemExit(errh)
@@ -34,26 +44,59 @@ def request_sitemap(url):
     return sitemap
 
 
-def parse_sitemap(sitemap):
-    """Parse URLs from the XML sitemap and return a list of URLs."""
+def validate_sitemap_url(url):
+    """Validate that the sitemap URL is valid (.xml format)."""
 
-    print("Parsing URLs from sitemap...")
+    u = urlsplit(url)
+    ext = splitext(u.path)[1]
+    return ext == ".xml"
+
+
+def process_sitemap(sitemap):
+    """
+    Process an individual sitemap or recursively process multiple sitemaps
+    via a sitemap index and return a full list of request URLs.
+    """
 
     root = ET.fromstring(sitemap)
-    namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
+    sitemap_type = root.tag.split("}")[-1]
 
-    urls = []
-    for url in root.findall(f"{namespace}url"):
-        loc = url.find(f"{namespace}loc")
-        urls.append(loc.text)
+    if sitemap_type == "sitemapindex":
+        request_urls = []
+        sitemap_urls = _parse_sitemap_index(root)
 
-    return urls
+        for sm_url in sitemap_urls:
+            sitemap = request_sitemap(sm_url)
+            request_urls.extend(process_sitemap(sitemap))
 
+    elif sitemap_type == "urlset":
+        request_urls = _parse_sitemap_urls(root)
 
-def validate_sitemap_url(url):
-    """Validate that the sitemap URL is valid (.xml format)."""
+    return request_urls
 
-    u = urlsplit(url)
-    ext = splitext(u.path)[1]
-    if ext == ".xml":
-        return True
+
+def _parse_sitemap_index(root):
+    """Parse sitemap URLs from the sitemap index and return them as a list."""
+
+    print("Sitemap index found. Parsing sitemap URLs...")
+    return _parse_urls_from_root(root, type="sitemap")
+
+
+def _parse_sitemap_urls(root):
+    """Parse URLs from the XML sitemap and return a list of request URLs."""
+
+    print("Parsing URLs from sitemap...")
+    return _parse_urls_from_root(root)
+
+
+def _parse_urls_from_root(root, type="url"):
+    """Parse URL locs from root xml element"""
+
+    namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
+    urls = []
+
+    for el in root.findall(f"{namespace}{type}"):
+        loc = el.find(f"{namespace}loc")
+        urls.append(loc.text)
+
+    return urls
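As a quick offline check of the new parsing path (a sketch, not part of the commit; the sample XML and URLs are invented), the snippet below mirrors what `process_sitemap` does for a standard `urlset` sitemap: it derives the document type from the root tag's local name, then collects each `<loc>` the way `_parse_urls_from_root` does. For a `sitemapindex`, the real code would instead fetch each listed sitemap with `request_sitemap` and recurse.

```python
import xml.etree.ElementTree as ET

# Hypothetical standard sitemap, kept small for illustration (no XML declaration so
# ET.fromstring can parse it directly from a str).
sample = """\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://www.example.com/</loc></url>
  <url><loc>https://www.example.com/blog/</loc></url>
</urlset>
"""

root = ET.fromstring(sample)

# Same detection process_sitemap() performs: strip the namespace from the root tag.
sitemap_type = root.tag.split("}")[-1]
print(sitemap_type)  # urlset

# Equivalent of _parse_urls_from_root(root): gather the text of each <url>/<loc>.
namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
urls = [el.find(f"{namespace}loc").text for el in root.findall(f"{namespace}url")]
print(urls)  # ['https://www.example.com/', 'https://www.example.com/blog/']
```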
