Source code for usp.objects.sitemap

"""Objects that represent one of the found sitemaps."""

import abc
import os
import pickle
import tempfile
from typing import List, Iterator

from .page import SitemapPage

class AbstractSitemap(object, metaclass=abc.ABCMeta):
    """
    Abstract sitemap.

    Stores the sitemap URL and requires subclasses to implement
    :meth:`all_pages`. Equality and hashing are based on the URL only.
    """

    __slots__ = [
        '__url',
    ]

    def __init__(self, url: str):
        """
        Initialize a new sitemap.

        :param url: Sitemap URL.
        """
        self.__url = url

    def __eq__(self, other) -> bool:
        """
        Compare two sitemaps by URL.

        :param other: Object to compare against.
        :return: True if both sitemaps have the same URL, False if they don't;
            ``NotImplemented`` if ``other`` is not a sitemap.
        """
        if not isinstance(other, AbstractSitemap):
            # NotImplemented is a sentinel value, not an exception class:
            # raising it produces an unrelated TypeError. Returning it lets
            # Python try the reflected comparison and fall back to "not equal".
            return NotImplemented

        return self.url == other.url

    def __hash__(self):
        """Return a hash derived from the sitemap URL (consistent with ``__eq__``)."""
        return hash((
            self.url,
        ))

    def __repr__(self):
        """Return a debugging representation that includes the class name and URL."""
        return (
            "{self.__class__.__name__}("
            "url={self.url}"
            ")"
        ).format(self=self)

    @property
    def url(self) -> str:
        """
        Return sitemap URL.

        :return: Sitemap URL.
        """
        return self.__url

    @abc.abstractmethod
    def all_pages(self) -> Iterator[SitemapPage]:
        """
        Return iterator which yields all pages of this sitemap and linked sitemaps (if any).

        :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any).
        """
        raise NotImplementedError("Abstract method")
class InvalidSitemap(AbstractSitemap):
    """Invalid sitemap, e.g. the one that can't be parsed."""

    __slots__ = [
        '__reason',
    ]

    def __init__(self, url: str, reason: str):
        """
        Initialize a new invalid sitemap.

        :param url: Sitemap URL.
        :param reason: Reason why the sitemap is deemed invalid.
        """
        super().__init__(url=url)
        self.__reason = reason

    # NOTE: this class overrides __eq__ without defining __hash__, so Python
    # sets __hash__ to None and instances are unhashable.
    def __eq__(self, other) -> bool:
        """
        Compare two invalid sitemaps by URL and reason.

        :param other: Object to compare against.
        :return: True if URL and reason both match, False if they don't;
            ``NotImplemented`` if ``other`` is not an :class:`InvalidSitemap`.
        """
        if not isinstance(other, InvalidSitemap):
            # NotImplemented is a sentinel to be returned, not an exception to
            # be raised; raising it would surface as an unrelated TypeError.
            return NotImplemented

        return self.url == other.url and self.reason == other.reason

    def __repr__(self):
        """Return a debugging representation with the class name, URL and reason."""
        return (
            "{self.__class__.__name__}("
            "url={self.url}, "
            "reason={self.reason}"
            ")"
        ).format(self=self)

    @property
    def reason(self) -> str:
        """
        Return reason why the sitemap is deemed invalid.

        :return: Reason why the sitemap is deemed invalid.
        """
        return self.__reason

    def all_pages(self) -> Iterator[SitemapPage]:
        """
        Return iterator which yields all pages of this sitemap and linked sitemaps (if any).

        An invalid sitemap has no pages, so the iterator is always empty.

        :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any).
        """
        yield from []
class AbstractPagesSitemap(AbstractSitemap, metaclass=abc.ABCMeta):
    """
    Abstract sitemap that contains URLs to pages.

    The list of pages is pickled into a temporary file on construction and
    unpickled on every access to :attr:`pages`; the temporary file is removed
    when the object is finalized.
    """

    __slots__ = [
        '__pages_temp_file_path',
    ]

    def __init__(self, url: str, pages: List[SitemapPage]):
        """
        Initialize new pages sitemap.

        :param url: Sitemap URL.
        :param pages: List of pages found in a sitemap.
        """
        super().__init__(url=url)

        # Keep the pages on disk instead of on the instance.
        temp_file, self.__pages_temp_file_path = tempfile.mkstemp()
        with os.fdopen(temp_file, 'wb') as tmp:
            pickle.dump(pages, tmp, protocol=pickle.HIGHEST_PROTOCOL)

    def __del__(self):
        """Remove the temporary file that backs :attr:`pages`."""
        try:
            path = self.__pages_temp_file_path
        except AttributeError:
            # __init__ failed before the temporary file was created, so there
            # is nothing to clean up.
            return

        try:
            os.unlink(path)
        except FileNotFoundError:
            # The file is already gone; the desired end state is reached.
            pass

    # NOTE: this class overrides __eq__ without defining __hash__, so Python
    # sets __hash__ to None and instances are unhashable.
    def __eq__(self, other) -> bool:
        """
        Compare two pages sitemaps by URL and list of pages.

        :param other: Object to compare against.
        :return: True if URL and pages both match, False if they don't;
            ``NotImplemented`` if ``other`` is not an :class:`AbstractPagesSitemap`.
        """
        if not isinstance(other, AbstractPagesSitemap):
            # NotImplemented is a sentinel to be returned, not an exception to
            # be raised; raising it would surface as an unrelated TypeError.
            return NotImplemented

        # Compare URLs first: it avoids unpickling both page lists when the
        # URLs already differ.
        return self.url == other.url and self.pages == other.pages

    def __repr__(self):
        """Return a debugging representation with the class name, URL and pages."""
        return (
            "{self.__class__.__name__}("
            "url={self.url}, "
            "pages={self.pages}"
            ")"
        ).format(self=self)

    @property
    def pages(self) -> List[SitemapPage]:
        """
        Return list of pages found in a sitemap.

        Every access re-reads the list from the temporary file.

        :return: List of pages found in a sitemap.
        """
        # SECURITY NOTE: pickle.load() must never be pointed at untrusted data.
        # The file read here is the one this object wrote in __init__.
        with open(self.__pages_temp_file_path, 'rb') as tmp:
            pages = pickle.load(tmp)
        return pages

    def all_pages(self) -> Iterator[SitemapPage]:
        """
        Return iterator which yields all pages of this sitemap and linked sitemaps (if any).

        :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any).
        """
        yield from self.pages
class PagesXMLSitemap(AbstractPagesSitemap):
    """Pages sitemap that was found in XML format."""
class PagesTextSitemap(AbstractPagesSitemap):
    """Pages sitemap that was found in plain text format."""
class PagesRSSSitemap(AbstractPagesSitemap):
    """Pages sitemap that was found in RSS 2.0 format."""
class PagesAtomSitemap(AbstractPagesSitemap):
    """Pages sitemap that was found in Atom 0.3 / 1.0 format."""
class AbstractIndexSitemap(AbstractSitemap):
    """
    Abstract sitemap with URLs to other sitemaps.
    """

    __slots__ = [
        '__sub_sitemaps',
    ]

    def __init__(self, url: str, sub_sitemaps: List[AbstractSitemap]):
        """
        Initialize index sitemap.

        :param url: Sitemap URL.
        :param sub_sitemaps: Sub-sitemaps that are linked to from this sitemap.
        """
        super().__init__(url=url)
        self.__sub_sitemaps = sub_sitemaps

    # NOTE: this class overrides __eq__ without defining __hash__, so Python
    # sets __hash__ to None and instances are unhashable.
    def __eq__(self, other) -> bool:
        """
        Compare two index sitemaps by URL and sub-sitemaps.

        :param other: Object to compare against.
        :return: True if URL and sub-sitemaps both match, False if they don't;
            ``NotImplemented`` if ``other`` is not an :class:`AbstractIndexSitemap`.
        """
        if not isinstance(other, AbstractIndexSitemap):
            # NotImplemented is a sentinel to be returned, not an exception to
            # be raised; raising it would surface as an unrelated TypeError.
            return NotImplemented

        return self.url == other.url and self.sub_sitemaps == other.sub_sitemaps

    def __repr__(self):
        """Return a debugging representation with the class name, URL and sub-sitemaps."""
        return (
            "{self.__class__.__name__}("
            "url={self.url}, "
            "sub_sitemaps={self.sub_sitemaps}"
            ")"
        ).format(self=self)

    @property
    def sub_sitemaps(self) -> List[AbstractSitemap]:
        """
        Return sub-sitemaps that are linked to from this sitemap.

        :return: Sub-sitemaps that are linked to from this sitemap.
        """
        return self.__sub_sitemaps

    def all_pages(self) -> Iterator[SitemapPage]:
        """
        Return iterator which yields all pages of this sitemap and linked sitemaps (if any).

        Pages are yielded sub-sitemap by sub-sitemap, in the order of
        :attr:`sub_sitemaps`.

        :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any).
        """
        for sub_sitemap in self.sub_sitemaps:
            yield from sub_sitemap.all_pages()
class IndexWebsiteSitemap(AbstractIndexSitemap):
    """Index of a website's root sitemaps, including robots.txt and extra ones."""
class IndexXMLSitemap(AbstractIndexSitemap):
    """Index sitemap in XML format that links to other sitemaps."""
class IndexRobotsTxtSitemap(AbstractIndexSitemap):
    """Index sitemap backed by robots.txt that links to other sitemaps."""