Source code for usp.tree

"""Helpers to generate a sitemap tree."""

from typing import Optional

from .exceptions import SitemapException
from .fetch_parse import SitemapFetcher
from .helpers import is_http_url, strip_url_to_homepage
from .log import create_logger
from .objects.sitemap import AbstractSitemap, InvalidSitemap, IndexWebsiteSitemap, IndexRobotsTxtSitemap
from .web_client.abstract_client import AbstractWebClient

log = create_logger(__name__)

_UNPUBLISHED_SITEMAP_PATHS = {
    'sitemap.xml',
    'sitemap.xml.gz',
    'sitemap_index.xml',
    'sitemap-index.xml',
    'sitemap_index.xml.gz',
    'sitemap-index.xml.gz',
    '.sitemap.xml',
    'sitemap',
    'admin/config/search/xmlsitemap',
    'sitemap/sitemap-index.xml',
}
"""Paths which are not exposed in robots.txt but might still contain a sitemap."""


def sitemap_tree_for_homepage(homepage_url: str, web_client: Optional[AbstractWebClient] = None) -> AbstractSitemap:
    """
    Using a homepage URL, fetch the tree of sitemaps and pages listed in them.

    :param homepage_url: Homepage URL of a website to fetch the sitemap tree for, e.g. "http://www.example.com/".
    :param web_client: Web client implementation to use for fetching sitemaps.
    :return: Root sitemap object of the fetched sitemap tree.
    """

    if not is_http_url(homepage_url):
        raise SitemapException("URL {} is not a HTTP(s) URL.".format(homepage_url))

    stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
    if homepage_url != stripped_homepage_url:
        log.warning("Assuming that the homepage of {} is {}".format(homepage_url, stripped_homepage_url))
        homepage_url = stripped_homepage_url

    if not homepage_url.endswith('/'):
        homepage_url += '/'
    robots_txt_url = homepage_url + 'robots.txt'

    sitemaps = []

    robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, web_client=web_client, recursion_level=0)
    robots_txt_sitemap = robots_txt_fetcher.sitemap()
    sitemaps.append(robots_txt_sitemap)

    sitemap_urls_found_in_robots_txt = set()
    if isinstance(robots_txt_sitemap, IndexRobotsTxtSitemap):
        for sub_sitemap in robots_txt_sitemap.sub_sitemaps:
            sitemap_urls_found_in_robots_txt.add(sub_sitemap.url)

    for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS:
        unpublished_sitemap_url = homepage_url + unpublished_sitemap_path

        # Don't refetch URLs already found in robots.txt
        if unpublished_sitemap_url not in sitemap_urls_found_in_robots_txt:
            unpublished_sitemap_fetcher = SitemapFetcher(
                url=unpublished_sitemap_url,
                web_client=web_client,
                recursion_level=0,
            )
            unpublished_sitemap = unpublished_sitemap_fetcher.sitemap()

            # Skip the ones that weren't found
            if not isinstance(unpublished_sitemap, InvalidSitemap):
                sitemaps.append(unpublished_sitemap)

    index_sitemap = IndexWebsiteSitemap(url=homepage_url, sub_sitemaps=sitemaps)

    return index_sitemap
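
A minimal usage sketch, not part of the module above: it assumes the package is importable as usp and that the returned AbstractSitemap exposes the all_pages() generator from the library's public API; the homepage URL is only a placeholder.

    from usp.tree import sitemap_tree_for_homepage

    tree = sitemap_tree_for_homepage("https://www.example.com/")

    # Walk every page discovered anywhere in the sitemap tree and print its URL.
    for page in tree.all_pages():
        print(page.url)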