PacktPublishing/LLM-Engineers-Handbook

Codebase is not pythonic

Closed this issue · 0 comments

Dispatcher.py unnecessarily uses builder-style methods, which is not Pythonic and inflates the number of lines.

class CrawlerDispatcher:
    """Map URL patterns to crawler classes and hand out crawler instances.

    Crawlers are registered per domain. ``get_crawler`` returns an instance of
    the first registered crawler whose pattern matches the URL (registration
    order is lookup order) and falls back to ``CustomArticleCrawler``.

    The ``register_*`` helpers return ``self`` so calls can be chained, e.g.
    ``CrawlerDispatcher.build().register_medium().register_github()``.
    """

    def __init__(self) -> None:
        # Regex pattern string -> crawler class, in registration order.
        self._crawlers = {}

    @classmethod
    def build(cls) -> "CrawlerDispatcher":
        """Return a new dispatcher with no crawlers registered."""
        return cls()

    def register_medium(self) -> "CrawlerDispatcher":
        """Register ``MediumCrawler`` for medium.com and return ``self``."""
        self.register("https://medium.com", MediumCrawler)

        return self

    def register_linkedin(self) -> "CrawlerDispatcher":
        """Register ``LinkedInCrawler`` for linkedin.com and return ``self``."""
        self.register("https://linkedin.com", LinkedInCrawler)

        return self

    def register_github(self) -> "CrawlerDispatcher":
        """Register ``GithubCrawler`` for github.com and return ``self``."""
        self.register("https://github.com", GithubCrawler)

        return self

    def register(self, domain: str, crawler: "type[BaseCrawler]") -> None:
        """Associate ``crawler`` with https URLs on ``domain``.

        Args:
            domain: A URL such as ``"https://medium.com"``; only its network
                location (host) part is used.
            crawler: Crawler class to instantiate for matching URLs.
        """
        host = urlparse(domain).netloc

        # The host must be followed by a URL delimiter or the end of the
        # string. A bare prefix match would also accept look-alike hosts such
        # as "github.community" or "medium.com.evil.example".
        pattern = rf"https://(www\.)?{re.escape(host)}(?:[/?#:]|$)"
        self._crawlers[pattern] = crawler

    def get_crawler(self, url: str) -> "BaseCrawler":
        """Return a crawler instance for ``url``.

        Patterns are tried in registration order and the first match wins.
        When nothing matches, a warning is logged and a
        ``CustomArticleCrawler`` is returned.
        """
        for pattern, crawler in self._crawlers.items():
            if re.match(pattern, url):
                return crawler()

        # No registered pattern matched (the loop never breaks, so a
        # ``for ... else`` clause would add nothing here).
        logger.warning(f"No crawler found for {url}. Defaulting to CustomArticleCrawler.")

        return CustomArticleCrawler()

Instead, it could be written as:

class CrawlerDispatcher:
    """Map URL patterns to crawler classes and hand out crawler instances.

    The Medium, LinkedIn and GitHub crawlers are registered on construction.
    ``get_crawler`` returns an instance of the first registered crawler whose
    pattern matches the URL (registration order is lookup order) and falls
    back to ``CustomArticleCrawler``.
    """

    def __init__(self):
        # Regex pattern string -> crawler class, in registration order.
        self._crawlers = {}
        self._register_default_crawlers()

    def _register_default_crawlers(self) -> None:
        """Register the built-in crawlers for their respective domains."""
        self.register("https://medium.com", MediumCrawler)
        self.register("https://linkedin.com", LinkedInCrawler)
        self.register("https://github.com", GithubCrawler)

    def register(self, domain: str, crawler: "type[BaseCrawler]") -> None:
        """Associate ``crawler`` with https URLs on ``domain``.

        Args:
            domain: A URL such as ``"https://medium.com"``; only its network
                location (host) part is used.
            crawler: Crawler class to instantiate for matching URLs.
        """
        host = urlparse(domain).netloc
        # The host must be followed by a URL delimiter or the end of the
        # string. A bare prefix match would also accept look-alike hosts such
        # as "github.community" or "medium.com.evil.example".
        pattern = rf"https://(www\.)?{re.escape(host)}(?:[/?#:]|$)"
        self._crawlers[pattern] = crawler

    def get_crawler(self, url: str) -> "BaseCrawler":
        """Return a crawler instance for ``url``.

        Patterns are tried in registration order and the first match wins.
        When nothing matches, a warning is logged and a
        ``CustomArticleCrawler`` is returned.
        """
        for pattern, crawler in self._crawlers.items():
            if re.match(pattern, url):
                return crawler()
        logger.warning(f"No crawler found for {url}. Defaulting to CustomArticleCrawler.")
        return CustomArticleCrawler()

Similarly, Python does not need Java-style abstract base classes; it has protocols.
https://peps.python.org/pep-0544/#using-protocols