# Source code for usp.web_client.requests_client
"""requests-based implementation of web client class."""
from http import HTTPStatus
from typing import Optional, Dict
import requests
from .abstract_client import (
AbstractWebClient,
AbstractWebClientResponse,
AbstractWebClientSuccessResponse,
WebClientErrorResponse,
RETRYABLE_HTTP_STATUS_CODES,
)
from usp.__about__ import __version__
class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
    """
    requests-based successful response.
    """

    __slots__ = [
        '__requests_response',
        '__max_response_data_length',
    ]

    def __init__(self, requests_response: requests.Response, max_response_data_length: Optional[int] = None):
        """
        :param requests_response: Underlying ``requests.Response`` object for a successful fetch.
        :param max_response_data_length: If set, ``raw_data()`` returns at most this many bytes.
        """
        self.__requests_response = requests_response
        self.__max_response_data_length = max_response_data_length

    def status_code(self) -> int:
        """Return the HTTP status code of the response."""
        return int(self.__requests_response.status_code)

    def status_message(self) -> str:
        """
        Return the HTTP reason phrase of the response.

        Falls back to the standard phrase for the status code when the server
        did not send one; returns an empty string for status codes unknown to
        ``http.HTTPStatus`` instead of raising ``ValueError``.
        """
        message = self.__requests_response.reason
        if not message:
            try:
                # HTTPStatus is an enum lookup taking a single value; the
                # original spurious second positional argument was dropped.
                message = HTTPStatus(self.status_code()).phrase
            except ValueError:
                # Non-standard status code (e.g. 599) has no canonical phrase.
                message = ''
        return message

    def raw_data(self) -> bytes:
        """Return the response body, truncated to ``max_response_data_length`` if set."""
        if self.__max_response_data_length:
            data = self.__requests_response.content[:self.__max_response_data_length]
        else:
            data = self.__requests_response.content
        return data
class RequestsWebClientErrorResponse(WebClientErrorResponse):
    """requests-based error response; all behavior is inherited from WebClientErrorResponse."""
class RequestsWebClient(AbstractWebClient):
    """requests-based web client to be used by the sitemap fetcher."""

    # User-Agent header sent with every request.
    __USER_AGENT = 'ultimate_sitemap_parser/{}'.format(__version__)

    # HTTP request timeout, in seconds.
    # Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
    __HTTP_REQUEST_TIMEOUT = 60

    __slots__ = [
        '__max_response_data_length',
        '__timeout',
        '__proxies',
    ]

    def __init__(self):
        # No body-length cap by default; successful responses return full content.
        self.__max_response_data_length = None
        self.__timeout = self.__HTTP_REQUEST_TIMEOUT
        self.__proxies = {}

    def set_timeout(self, timeout: int) -> None:
        """Set HTTP request timeout, in seconds."""
        # Used mostly for testing
        self.__timeout = timeout

    def set_proxies(self, proxies: Dict[str, str]) -> None:
        """
        Set proxies from a dictionary where:

        * keys are schemes, e.g. "http" or "https";
        * values are "scheme://user:password@host:port/".

        For example:

            proxies = {'http': 'http://user:pass@10.10.1.10:3128/'}
        """
        # Used mostly for testing
        self.__proxies = proxies

    def set_max_response_data_length(self, max_response_data_length: int) -> None:
        """Cap the number of body bytes that successful responses will return."""
        self.__max_response_data_length = max_response_data_length

    def get(self, url: str) -> AbstractWebClientResponse:
        """
        Fetch *url* and wrap the outcome in a web-client response object.

        :param url: URL to fetch.
        :return: a ``RequestsWebClientSuccessResponse`` for 2xx responses;
            otherwise a ``RequestsWebClientErrorResponse`` which is retryable
            for timeouts and for HTTP statuses in ``RETRYABLE_HTTP_STATUS_CODES``.
        """
        try:
            response = requests.get(
                url,
                timeout=self.__timeout,
                stream=True,
                headers={'User-Agent': self.__USER_AGENT},
                proxies=self.__proxies,
            )
        except requests.exceptions.Timeout as ex:
            # Retryable timeouts
            return RequestsWebClientErrorResponse(message=str(ex), retryable=True)
        except requests.exceptions.RequestException as ex:
            # Other errors, e.g. redirect loops
            return RequestsWebClientErrorResponse(message=str(ex), retryable=False)
        else:
            if 200 <= response.status_code < 300:
                return RequestsWebClientSuccessResponse(
                    requests_response=response,
                    max_response_data_length=self.__max_response_data_length,
                )

            # `reason` can be None when the server sends no reason phrase;
            # avoid producing messages like "404 None".
            message = '{} {}'.format(response.status_code, response.reason or '').strip()
            retryable = response.status_code in RETRYABLE_HTTP_STATUS_CODES
            return RequestsWebClientErrorResponse(message=message, retryable=retryable)