libs/core/langchain_core/document_loaders/base.py · langchain-ai/langchain

"""Abstract interface for document loader implementations."""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from langchain_core.runnables import run_in_executor

if TYPE_CHECKING:
    from collections.abc import AsyncIterator, Iterator

    from langchain_text_splitters import TextSplitter

    from langchain_core.documents import Document
    from langchain_core.documents.base import Blob

# `langchain-text-splitters` is optional: remember whether it could be imported
# so that `load_and_split` can raise a helpful error when it is missing.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    _HAS_TEXT_SPLITTERS = True
except ImportError:
    _HAS_TEXT_SPLITTERS = False


class BaseLoader(ABC):  # noqa: B024
    """Interface for document loader.

    Implementations should provide the lazy-loading method as a generator so
    that documents do not all have to be held in memory at once.

    `load` is provided just for user convenience and should not be overridden.
    """

    # Sub-classes should not implement this method directly. Instead, they
    # should implement the lazy load method.
    def load(self) -> list[Document]:
        """Eagerly load every `Document` produced by `lazy_load`.

        Returns:
            The documents.
        """
        return list(self.lazy_load())

    async def aload(self) -> list[Document]:
        """Asynchronously collect every `Document` produced by `alazy_load`.

        Returns:
            The documents.
        """
        return [doc async for doc in self.alazy_load()]

    def load_and_split(
        self, text_splitter: TextSplitter | None = None
    ) -> list[Document]:
        """Load `Document` objects and split them into chunks.

        The chunks are themselves returned as `Document` objects.

        !!! danger

            Do not override this method. It should be considered to be deprecated!

        Args:
            text_splitter: `TextSplitter` instance to use for splitting documents.

                Defaults to `RecursiveCharacterTextSplitter`.

        Raises:
            ImportError: If `langchain-text-splitters` is not installed and no
                `text_splitter` is provided.

        Returns:
            List of `Document` objects.
        """
        splitter = text_splitter
        if splitter is None:
            # No splitter supplied: fall back to the default one, which is only
            # available when the optional package could be imported. This is
            # resolved before any document is loaded.
            if not _HAS_TEXT_SPLITTERS:
                msg = (
                    "Unable to import from langchain_text_splitters. Please specify "
                    "text_splitter or install langchain_text_splitters with "
                    "`pip install -U langchain-text-splitters`."
                )
                raise ImportError(msg)
            splitter = RecursiveCharacterTextSplitter()
        return splitter.split_documents(self.load())

    # Attention: This method will be upgraded into an abstractmethod once it's
    #            implemented in all the existing subclasses.
    def lazy_load(self) -> Iterator[Document]:
        """A lazy loader for `Document`.

        When a subclass overrides `load` instead of this method, the result of
        that `load` is wrapped in an iterator.

        Raises:
            NotImplementedError: If the subclass overrides neither `lazy_load`
                nor `load`.

        Yields:
            The `Document` objects.
        """
        # Comparing against `BaseLoader.load` detects whether the concrete class
        # supplies its own `load`; without one there is nothing to fall back on.
        if type(self).load == BaseLoader.load:
            msg = f"{self.__class__.__name__} does not implement lazy_load()"
            raise NotImplementedError(msg)
        return iter(self.load())

    async def alazy_load(self) -> AsyncIterator[Document]:
        """A lazy loader for `Document`.

        This default implementation wraps the synchronous `lazy_load`: both the
        creation of the iterator and every `next` call go through
        `run_in_executor`.

        Yields:
            The `Document` objects.
        """
        # Unique marker returned by `next` once the iterator is exhausted.
        sentinel = object()
        source = await run_in_executor(None, self.lazy_load)
        while True:
            item = await run_in_executor(None, next, source, sentinel)
            if item is sentinel:
                return
            yield item  # type: ignore[misc]


class BaseBlobParser(ABC):
    """Abstract interface for blob parsers.

    A blob parser turns the raw data stored in a blob into one or more
    `Document` objects.

    The parser can be composed with blob loaders, making it easy to reuse a parser
    independent of how the blob was originally loaded.
    """

    @abstractmethod
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazy parsing interface.

        Subclasses are required to implement this method.

        Args:
            blob: `Blob` instance

        Returns:
            Generator of `Document` objects
        """

    def parse(self, blob: Blob) -> list[Document]:
        """Eagerly parse the blob into a list of `Document` objects.

        This is a convenience method for interactive development environment.

        Production applications should favor the `lazy_parse` method instead.

        Subclasses should generally not over-ride this parse method.

        Args:
            blob: `Blob` instance

        Returns:
            List of `Document` objects
        """
        return list(self.lazy_parse(blob))

Code quality findings 4

Avoid unnecessary list conversions; use generators where possible

L43

unnecessary-list

return list(self.lazy_load())

Ensure functions have docstrings for documentation

L53

missing-docstring

def load_and_split(

Use isinstance() for type checking instead of type()

L97

type-check

if type(self).load != BaseLoader.load:

Avoid unnecessary list conversions; use generators where possible

L155

unnecessary-list

return list(self.lazy_parse(blob))

Code quality findings 4

Get this view in your editor