Source code for datadiligence.rules.http

"""
Rules to manage validation using HTTP properties
"""

from ..exceptions import XRobotsTagNoParam, TDMRepNoParam
from .base import HttpRule


class XRobotsTagHeader(HttpRule):
    """
    This class wraps logic to read the X-Robots-Tag header.
    """

    # Directive values that opt a resource out of AI usage.
    AI_DISALLOWED_VALUES = ["noai", "noimageai"]
    # Superset of the AI values that also includes the indexing directives.
    INDEX_DISALLOWED_VALUES = ["noindex", "none", "noimageindex", "noai", "noimageai"]
    HEADER_NAME = "X-Robots-Tag"

    def __init__(self, user_agent=None, respect_noindex=False):
        """Create a new XRobotsTagHeader instance.

        Args:
            user_agent (str): The user agent forwarded to the base rule. It is also the
                default token matched against ``<agent>: <directive>`` entries when
                evaluating a header value.
            respect_noindex (bool): If True, index rules will be respected alongside AI rules.
        """
        super().__init__(user_agent=user_agent)

        # index rules aren't for AI, so we ignore them by default.
        # They could have been delivered/found by any number of other means, even for internal use
        if respect_noindex:
            self.disallowed_headers = self.INDEX_DISALLOWED_VALUES
        else:
            self.disallowed_headers = self.AI_DISALLOWED_VALUES

    def is_allowed(self, url=None, response=None, headers=None, **kwargs):
        """Check if the X-Robots-Tag header allows the user agent to access the resource.

        The header value is taken from the first usable source, in this order:
        ``headers``, then ``response``, then a request made for ``url``.

        Args:
            url: (str): The URL of the resource.
            response (http.client.HTTPResponse|requests.Response, optional): The response object.
                Defaults to None
            headers (dict|http.client.HTTPMessage, optional): The headers dictionary. An empty
                mapping is treated as not provided. Defaults to None.
            **kwargs: Forwarded to ``_eval_header_value`` (e.g. a ``user_agent`` override).

        Returns:
            bool: True if the user agent is allowed to access the resource, False otherwise.

        Raises:
            XRobotsTagNoParam: If none of ``url``, ``response`` or ``headers`` is provided.
        """
        if headers:
            header_value = self.get_header_value(headers, self.HEADER_NAME)
        elif response is not None:
            # Compare against None rather than relying on truthiness: requests.Response
            # evaluates to False for 4xx/5xx statuses, yet its headers are still valid input.
            header_value = self.get_header_value_from_response(response, self.HEADER_NAME)
        elif url:
            response = self._handle_url(url)
            header_value = self.get_header_value(response.headers, self.HEADER_NAME)
        else:
            raise XRobotsTagNoParam()

        return self._eval_header_value(header_value, **kwargs)

    def _eval_header_value(self, header_value, user_agent=None, **kwargs):
        """
        Evaluate the header value to determine if the user agent is allowed to access the resource.

        The header is split on commas. Access is denied when any entry is a disallowed
        directive on its own, or has the exact form ``<user_agent>: <directive>`` with a
        disallowed directive for the effective user agent.

        Args:
            header_value (str): The header value.
            user_agent (str): Override for the instance's user agent when matching
                agent-specific entries.

        Returns:
            bool: True if the user agent is allowed to access the resource, False otherwise.
        """
        # A missing or empty header places no restriction on the resource.
        if not header_value:
            return True

        # Fall back to the instance-level user agent when no override is given.
        if not user_agent:
            user_agent = self.user_agent

        for value in header_value.split(","):
            # Directive that applies to all user agents.
            if value.strip() in self.disallowed_headers:
                return False

            # Directive scoped to a specific user agent: "<agent>: <directive>".
            if user_agent:
                ua_values = value.split(":")
                if (
                    len(ua_values) == 2
                    and ua_values[0].strip() == user_agent
                    and ua_values[1].strip() in self.disallowed_headers
                ):
                    return False

        return True
class TDMRepHeader(HttpRule):
    """
    This class wraps logic to evaluate the TDM Reservation Protocol headers: https://www.w3.org/2022/tdmrep/.
    """

    HEADER_NAME = "tdm-reservation"

    def __init__(self):
        """Create a new TDMRepHeader instance."""
        super().__init__()

    def is_allowed(self, url=None, response=None, headers=None, **kwargs):
        """Check if the tdm-rep header allows access to the resource without a policy.

        The header value is taken from the first usable source, in this order:
        ``headers``, then ``response``, then a request made for ``url``.

        Args:
            url: (str): The URL of the resource.
            response (http.client.HTTPResponse|requests.Response, optional): The response object.
                Defaults to None
            headers (dict|http.client.HTTPMessage, optional): The headers dictionary. An empty
                mapping is treated as not provided. Defaults to None.
            **kwargs: Forwarded to ``_eval_header_value``.

        Returns:
            bool: True if access is allowed for the resource, False otherwise.

        Raises:
            TDMRepNoParam: If none of ``url``, ``response`` or ``headers`` is provided.
        """
        if headers:
            header_value = self.get_header_value(headers, self.HEADER_NAME)
        elif response is not None:
            # Compare against None rather than relying on truthiness: requests.Response
            # evaluates to False for 4xx/5xx statuses, yet its headers are still valid input.
            header_value = self.get_header_value_from_response(response, self.HEADER_NAME)
        elif url:
            response = self._handle_url(url)
            header_value = self.get_header_value(response.headers, self.HEADER_NAME)
        else:
            raise TDMRepNoParam()

        return self._eval_header_value(header_value, **kwargs)

    def _eval_header_value(self, header_value, **kwargs):
        """
        Evaluate the header value to determine if the resource permits anonymous access.

        Args:
            header_value (str): The header value.

        Returns:
            bool: True if resource allows access without a policy, False otherwise. A missing
                or empty header is treated as allowed; only the value "1" (ignoring
                surrounding whitespace) denies access.
        """
        if not header_value:
            return True

        return header_value.strip() != "1"