This repository was archived by the owner on Aug 11, 2023. It is now read-only.

Commit c9135cb

Merge pull request #8 from wjh18/sitemap-enhancements
Sitemap enhancements
2 parents 6fce75c + c718a52 commit c9135cb

3 files changed (+73 -26 lines changed)


README.md

Lines changed: 8 additions & 4 deletions
@@ -84,14 +84,18 @@ To remove your API key from your default keystore, run `keyring del system psike
 
 ## Sitemap Support
 
-Currently, only URLs to valid XML sitemaps are supported for reports that utilize sitemap format.
+Currently, only URLs to valid XML sitemaps are supported for reports that utilize sitemap format. Please see [sitemaps.org](https://sitemaps.org/protocol.html) for specification details.
 
-Your web server or sitemap plugin must also allow robots to crawl your sitemap. If you see any permission errors that would be the first thing to check.
-
-In the future, support for sitemap indices, multiple sitemaps and more advanced sitemap parsing will hopefully be added.
+Your web server or sitemap plugin must also allow robots to crawl your sitemap. If you see any permission errors, that would be the first thing to check. Certain security solutions like CloudFlare also block crawlers, so whitelisting the server you're running the package from may also be preferable.
 
 Your sitemap URL should be passed in as the positional argument for `url` when running `psi` from the command line.
 
+### Sitemap Index
+
+Support for sitemap index detection was recently added. This requires no additional action on your part. Simply pass your sitemap index in as the `url` argument via the CLI.
+
+If a sitemap index is detected, the package will recursively gather the URLs listed in each sitemap in your sitemap index and include them in requests. If a standard sitemap file is passed, only that sitemap will be processed.
+
 ## Command Line Arguments
 
 If you've installed `pyspeedinsights` with `pip`, the default command to run cli commands is `psi`.
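For readers unfamiliar with the distinction the new Sitemap Index section relies on: per the sitemaps.org protocol linked above, a sitemap index is an XML file whose root element is `<sitemapindex>` and whose `<sitemap>/<loc>` entries point at other sitemaps, while a standard sitemap's root element is `<urlset>`. The sketch below is not part of this commit and its URLs are hypothetical; it only illustrates the shape the new detection keys on. Either kind of URL is passed to the CLI the same way, e.g. `psi https://www.example.com/sitemap_index.xml` (hypothetical URL).

```python
# Hypothetical sitemap index, following the sitemaps.org protocol. Each <sitemap>/<loc>
# points at a child sitemap that the package would fetch and parse in turn, merging
# every page URL it finds into a single request list.
SITEMAP_INDEX = """\
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://www.example.com/sitemap-posts.xml</loc></sitemap>
  <sitemap><loc>https://www.example.com/sitemap-pages.xml</loc></sitemap>
</sitemapindex>
"""
```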

src/pyspeedinsights/app.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 from .api.response import process_excel, process_json
 from .cli import commands
 from .core.excel import ExcelWorkbook
-from .core.sitemap import parse_sitemap, request_sitemap
+from .core.sitemap import process_sitemap, request_sitemap
 
 
 def main():
@@ -42,7 +42,7 @@ def main():
         # Create list of request URLs based on sitemap.
         sitemap_url = url
         sitemap = request_sitemap(sitemap_url)
-        request_urls = parse_sitemap(sitemap)
+        request_urls = process_sitemap(sitemap)
         request_urls = list(set(request_urls))  # Remove duplicates if they exist.
     else:
         # For analyzing a single page, only process the requested URL.
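Taken together with the sitemap.py changes below, the sitemap branch of `main()` now boils down to the flow sketched here. This is a hedged illustration rather than a verbatim excerpt: the import path follows the diff, but the example URL is hypothetical and the real code takes it from the parsed CLI arguments.

```python
from pyspeedinsights.core.sitemap import process_sitemap, request_sitemap

# Hypothetical sitemap URL; in the real flow this comes from the positional `url` argument.
sitemap_url = "https://www.example.com/sitemap.xml"

sitemap = request_sitemap(sitemap_url)   # GET with a browser-like user agent, .xml URL validated
request_urls = process_sitemap(sitemap)  # urlset -> page URLs; sitemapindex -> fetch and recurse
request_urls = list(set(request_urls))   # drop duplicate URLs before requesting reports
```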

src/pyspeedinsights/core/sitemap.py

Lines changed: 63 additions & 20 deletions
@@ -8,16 +8,26 @@
 
 
 def request_sitemap(url):
-    """Retrieve the sitemap from the URL provided in cmd args."""
+    """Retrieve the sitemap from the given URL"""
 
     url = validate_url(url)
-
-    if validate_sitemap_url(url) is not True:
-        err = "Invalid sitemap provided. Please provide a link to a valid XML sitemap."
+    # Set a dummy user agent to avoid bot detection by firewalls
+    # e.g. CloudFlare issues a 403 if it detects the default requests module user-agent
+    dummy_user_agent = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/104.0.5112.79 Safari/537.36"
+    )
+    headers = {"user-agent": dummy_user_agent}
+
+    if not validate_sitemap_url(url):
+        err = (
+            "Invalid sitemap URL provided. Please provide a URL to a valid XML sitemap."
+        )
         raise SystemExit(err)
     try:
         print(f"Requesting sitemap... ({url})")
-        resp = requests.get(url)
+        resp = requests.get(url, headers=headers)
         resp.raise_for_status()
     except requests.exceptions.HTTPError as errh:
         raise SystemExit(errh)
@@ -34,26 +44,59 @@ def request_sitemap(url):
     return sitemap
 
 
-def parse_sitemap(sitemap):
-    """Parse URLs from the XML sitemap and return a list of URLs."""
+def validate_sitemap_url(url):
+    """Validate that the sitemap URL is valid (.xml format)."""
 
-    print("Parsing URLs from sitemap...")
+    u = urlsplit(url)
+    ext = splitext(u.path)[1]
+    return ext == ".xml"
+
+
+def process_sitemap(sitemap):
+    """
+    Process an individual sitemap or recursively process multiple sitemaps
+    via a sitemap index and return a full list of request URLs.
+    """
 
     root = ET.fromstring(sitemap)
-    namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
+    sitemap_type = root.tag.split("}")[-1]
 
-    urls = []
-    for url in root.findall(f"{namespace}url"):
-        loc = url.find(f"{namespace}loc")
-        urls.append(loc.text)
+    if sitemap_type == "sitemapindex":
+        request_urls = []
+        sitemap_urls = _parse_sitemap_index(root)
 
-    return urls
+        for sm_url in sitemap_urls:
+            sitemap = request_sitemap(sm_url)
+            request_urls.extend(process_sitemap(sitemap))
 
+    elif sitemap_type == "urlset":
+        request_urls = _parse_sitemap_urls(root)
 
-def validate_sitemap_url(url):
-    """Validate that the sitemap URL is valid (.xml format)."""
+    return request_urls
 
-    u = urlsplit(url)
-    ext = splitext(u.path)[1]
-    if ext == ".xml":
-        return True
+
+def _parse_sitemap_index(root):
+    """Parse sitemap URLs from the sitemap index and return them as a list."""
+
+    print("Sitemap index found. Parsing sitemap URLs...")
+    return _parse_urls_from_root(root, type="sitemap")
+
+
+def _parse_sitemap_urls(root):
+    """Parse URLs from the XML sitemap and return a list of request URLs."""
+
+    print("Parsing URLs from sitemap...")
+    return _parse_urls_from_root(root)
+
+
+def _parse_urls_from_root(root, type="url"):
+    """Parse URL locs from root xml element"""
+
+    namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
+    urls = []
+
+    for el in root.findall(f"{namespace}{type}"):
+        loc = el.find(f"{namespace}loc")
+        urls.append(loc.text)
+
+    return urls
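As a quick offline check of the new parsing path (a sketch, not part of the commit; the sample XML and URLs are invented), the snippet below mirrors what `process_sitemap` does for a standard `urlset` sitemap: it derives the document type from the root tag's local name, then collects each `<loc>` the way `_parse_urls_from_root` does. For a `sitemapindex`, the real code would instead fetch each listed sitemap with `request_sitemap` and recurse.

```python
import xml.etree.ElementTree as ET

# Hypothetical standard sitemap, kept small for illustration (no XML declaration so
# ET.fromstring can parse it directly from a str).
sample = """\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://www.example.com/</loc></url>
  <url><loc>https://www.example.com/blog/</loc></url>
</urlset>
"""

root = ET.fromstring(sample)

# Same detection process_sitemap() performs: strip the namespace from the root tag.
sitemap_type = root.tag.split("}")[-1]
print(sitemap_type)  # urlset

# Equivalent of _parse_urls_from_root(root): gather the text of each <url>/<loc>.
namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
urls = [el.find(f"{namespace}loc").text for el in root.findall(f"{namespace}url")]
print(urls)  # ['https://www.example.com/', 'https://www.example.com/blog/']
```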
