Source code for datadiligence.evaluators.preprocess

"""
This module contains the PreprocessEvaluator class.
"""

from .base import Evaluator
from ..rules import SpawningAPI, BulkRule


class PreprocessEvaluator(Evaluator):
    """
    Preprocess Evaluator class. Loads SpawningAPI rule by default.
    """

    name = "preprocess"

    def __init__(self, user_agent=None):
        """Load the default rules.

        Args:
            user_agent (str): The user agent to pass on to the rules.
        """
        super().__init__()
        self.add_rule(SpawningAPI(user_agent))

    def add_rule(self, rule):
        """Add a rule to the evaluator.

        Only instances of ``BulkRule`` (or of a subclass) are registered;
        any other object is ignored without raising.

        Args:
            rule: The rule object to register.
        """
        # NOTE(review): self.rules is not created in this class; presumably
        # it is initialised by Evaluator.__init__ -- confirm.
        if isinstance(rule, BulkRule):
            self.rules.append(rule)

    def filter_allowed(self, urls=None, **kwargs):
        """Filter a list of urls based on the rules in this evaluator.

        Each ready rule is applied in registration order to the output of
        the previous one, so the list can only be narrowed further.

        Args:
            urls (list): A list of urls to filter.
            **kwargs: Arbitrary keyword arguments forwarded to each rule.

        Returns:
            list: A list of urls that are allowed. An empty list when
            ``urls`` is None; ``urls`` itself when no rule is ready.
        """
        if urls is None:
            return []

        allowed = urls
        for rule in self.rules:
            # Nothing left to filter, so the remaining rules cannot change
            # the result. len() is kept (rather than truthiness) so any
            # sized container returned by a rule is handled the same way.
            if len(allowed) == 0:
                break

            if rule.is_ready():
                allowed = rule.filter_allowed(urls=allowed, **kwargs)

        return allowed

    def is_allowed(self, urls=None, **kwargs):
        """Check if the urls are allowed.

        Args:
            urls (list): A list of urls to check.
            **kwargs: Arbitrary keyword arguments forwarded to each rule.

        Returns:
            list: One value per url, in input order, indicating whether the
            url can be used. A url is allowed only if every ready rule
            allows it. An empty list when ``urls`` is None.
        """
        if urls is None:
            return []

        # Start from "everything allowed" and let each ready rule veto urls.
        allowed = [True] * len(urls)
        for rule in self.rules:
            if rule.is_ready():
                rule_results = rule.is_allowed(urls=urls, **kwargs)
                # A position stays truthy only if this rule also allows it.
                # NOTE(review): zip stops at the shorter sequence, so a rule
                # returning fewer results than urls would shorten the
                # output -- confirm rules return one entry per url.
                allowed = [a and b for a, b in zip(allowed, rule_results)]

        return allowed