# Source code for usp.web_client.requests_client

"""requests-based implementation of web client class."""

from http import HTTPStatus
from typing import Optional, Dict

import requests

from .abstract_client import (
    AbstractWebClient,
    AbstractWebClientResponse,
    AbstractWebClientSuccessResponse,
    WebClientErrorResponse,
    RETRYABLE_HTTP_STATUS_CODES,
)
from usp.__about__ import __version__


class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
    """
    requests-based successful (2xx) response.
    """

    __slots__ = [
        '__requests_response',
        '__max_response_data_length',
    ]

    def __init__(self, requests_response: requests.Response, max_response_data_length: Optional[int] = None):
        """
        :param requests_response: Underlying ``requests`` response object.
        :param max_response_data_length: If set, ``raw_data()`` returns at most this many bytes.
        """
        self.__requests_response = requests_response
        self.__max_response_data_length = max_response_data_length

    def status_code(self) -> int:
        """Return the numeric HTTP status code of the response."""
        return int(self.__requests_response.status_code)

    def status_message(self) -> str:
        """
        Return the HTTP reason phrase.

        Falls back to the standard phrase for the status code when the server
        did not send one.
        """
        message = self.__requests_response.reason
        if not message:
            # FIX: HTTPStatus() raises ValueError for non-standard status
            # codes (e.g. 599), which previously made status_message() crash
            # whenever such a code arrived without a reason phrase. Fall back
            # to an empty string instead.
            try:
                message = HTTPStatus(self.status_code()).phrase
            except ValueError:
                message = ''
        return message

    def header(self, case_insensitive_name: str) -> Optional[str]:
        """Return the named response header, or None if absent."""
        # requests' headers mapping is case-insensitive already; lowering the
        # name is harmless and keeps the lookup explicit.
        return self.__requests_response.headers.get(case_insensitive_name.lower(), None)

    def raw_data(self) -> bytes:
        """Return the response body, truncated to the configured maximum length (if any)."""
        if self.__max_response_data_length:
            data = self.__requests_response.content[:self.__max_response_data_length]
        else:
            data = self.__requests_response.content
        return data
class RequestsWebClientErrorResponse(WebClientErrorResponse):
    """requests-based error response; all behaviour comes from WebClientErrorResponse."""
class RequestsWebClient(AbstractWebClient):
    """requests-based web client to be used by the sitemap fetcher."""

    # User-Agent header sent with every request.
    __USER_AGENT = 'ultimate_sitemap_parser/{}'.format(__version__)

    # Default HTTP request timeout, in seconds. Some webservers might be
    # generating huge sitemaps on the fly, so this is why it's rather big.
    __HTTP_REQUEST_TIMEOUT = 60

    __slots__ = [
        '__max_response_data_length',
        '__timeout',
        '__proxies',
    ]

    def __init__(self):
        self.__max_response_data_length = None
        self.__timeout = self.__HTTP_REQUEST_TIMEOUT
        self.__proxies = {}

    def set_timeout(self, timeout: int) -> None:
        """Set HTTP request timeout, in seconds."""
        # Used mostly for testing
        self.__timeout = timeout

    def set_proxies(self, proxies: Dict[str, str]) -> None:
        """
        Set proxies from a dictionary where:

        * keys are schemes, e.g. "http" or "https";
        * values are "scheme://user:password@host:port/".

        For example:

            proxies = {'http': 'http://user:pass@10.10.1.10:3128/'}
        """
        # Used mostly for testing
        self.__proxies = proxies

    def set_max_response_data_length(self, max_response_data_length: int) -> None:
        """Limit the number of body bytes exposed by successful responses."""
        self.__max_response_data_length = max_response_data_length

    def get(self, url: str) -> AbstractWebClientResponse:
        """
        Fetch *url* and wrap the outcome in a web client response object.

        Timeouts are reported as retryable errors; other request failures
        (e.g. redirect loops) as non-retryable. Non-2xx statuses become error
        responses, retryable when listed in RETRYABLE_HTTP_STATUS_CODES.
        """
        try:
            response = requests.get(
                url,
                timeout=self.__timeout,
                stream=True,
                headers={'User-Agent': self.__USER_AGENT},
                proxies=self.__proxies,
            )
        except requests.exceptions.Timeout as ex:
            # Retryable timeouts
            return RequestsWebClientErrorResponse(message=str(ex), retryable=True)
        except requests.exceptions.RequestException as ex:
            # Other errors, e.g. redirect loops
            return RequestsWebClientErrorResponse(message=str(ex), retryable=False)
        else:
            if 200 <= response.status_code < 300:
                return RequestsWebClientSuccessResponse(
                    requests_response=response,
                    max_response_data_length=self.__max_response_data_length,
                )
            else:
                message = '{} {}'.format(response.status_code, response.reason)
                # FIX: with stream=True the body of an error response was
                # never read nor closed, keeping the underlying connection
                # checked out of the pool. Close it before discarding.
                response.close()
                if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                    return RequestsWebClientErrorResponse(message=message, retryable=True)
                else:
                    return RequestsWebClientErrorResponse(message=message, retryable=False)