Skip to content

Index

BaseMetadata

Bases: BaseModel, MutableMapping, Generic[TMetadataOwner]

The base metadata class is utilized for defining a basic yet flexible interface for metadata attached to various fields.

When used out of the box, the metadata class will adopt dictionary-like behavior. You may easily access different fields of the metadata as if it were a dictionary:

# Instantiate it with any kwargs you like
metadata = BaseMetadata(foo='bar', cow='moo')

metadata["foo"]  # "bar"
metadata["cow"]  # "moo"

# Update the value of the key
metadata["foo"] = "fighters"

# Set new key-value pairs
metadata['sheep'] = 'baa'

Otherwise, you may sub-class the metadata class in order to create a more strictly typed metadata model. This is useful when you want to enforce a specific structure for your metadata.

class CustomMetadata(BaseMetadata):
    foo: str
    cow: str

# Instantiate it with the required fields
metadata = CustomMetadata(foo='bar', cow='moo')

metadata.foo  # "bar"
metadata.cow  # "moo"

# Update the value of the key
metadata.foo = "fighters"

# Use the extra field to store dynamic metadata
metadata.extra['sheep'] = 'baa'

Additionally, the task results descriptor allows for controlled and easy access to the task results of various tasks that are run on the parent node.

Source code in docprompt/schema/pipeline/metadata.py
class BaseMetadata(BaseModel, MutableMapping, Generic[TMetadataOwner]):
    """
    The base metadata class is utilized for defining a basic yet flexible interface
    for metadata attached to various fields.

    The metadata class can be used in two ways:
        1. As a dictionary-like object, where metadata is stored in the `extra` field.
        2. As a sub-classed model, where metadata is stored in the fields of the model.

    When used out of the box, the metadata class will adopt dictionary-like behavior. You
    may easily access different fields of the metadata as if it were a dictionary:
    ```python
    # Instantiate it with any kwargs you like
    metadata = BaseMetadata(foo='bar', cow='moo')

    metadata["foo"]  # "bar"
    metadata["cow"]  # "moo"

    # Update the value of the key
    metadata["foo"] = "fighters"

    # Set new key-value pairs
    metadata['sheep'] = 'baa'
    ```

    Otherwise, you may sub-class the metadata class in order to create a more strictly typed
    metadata model. This is useful when you want to enforce a specific structure for your metadata.

    ```python
    class CustomMetadata(BaseMetadata):
        foo: str
        cow: str

    # Instantiate it with the required fields
    metadata = CustomMetadata(foo='bar', cow='moo')

    metadata.foo  # "bar"
    metadata.cow  # "moo"

    # Update the value of the key
    metadata.foo = "fighters"

    # Use the extra field to store dynamic metadata
    metadata.extra['sheep'] = 'baa'
    ```

    Additionally, the task results descriptor allows for controlled and easy access to the task results
    of various tasks that are run on the parent node.
    """

    # Storage for dynamic (dictionary-like) metadata. The factory alone expresses
    # the defaulted field; the original `Field(..., default_factory=dict, ...)`
    # mixed the required marker with a default factory, which is misleading
    # (pydantic v2 discards the `...` when a factory is given).
    extra: Dict[str, Any] = Field(default_factory=dict, repr=False)

    _task_results: TaskResultsDescriptor = PrivateAttr(
        default_factory=TaskResultsDescriptor
    )

    _owner: TMetadataOwner = PrivateAttr()

    @property
    def task_results(self) -> TaskResultsDescriptor:
        """Return the task results descriptor."""
        return self._task_results.__get__(self)

    @task_results.setter
    def task_results(self, value: Any) -> None:
        """This will raise an error, as we do not want to set the task results directly.

        NOTE: This implementation is here purely to avoid the task_results property from being
        overwritten by accident.
        """
        self._task_results.__set__(self, value)

    @property
    def owner(self) -> TMetadataOwner:
        """Return the owner of the metadata."""
        return self._owner

    @owner.setter
    def owner(self, owner: TMetadataOwner) -> None:
        """Set the owner of the metadata."""
        self._owner = owner

    @classmethod
    def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
        """Create a new instance of the metadata class with the owner set."""
        metadata = cls(**data)
        metadata.owner = owner
        return metadata

    @model_validator(mode="before")
    @classmethod
    def validate_data_fields_from_annotations(cls, data: Any) -> Any:
        """Validate the data fields from the annotations."""

        # We want to make sure that we combine the `extra` metadata along with any
        # other specific fields that are defined in the metadata.
        extra = data.pop("extra", {})
        assert isinstance(extra, dict), "The `extra` field must be a dictionary."
        data = {**data, **extra}

        # If the model has been sub-classed, then all of our fields must be
        # validated by the pydantic model.
        if cls._is_field_typed():
            # Keys lifted out of `extra` become candidate fields to validate; keys
            # not defined on the model are ignored, while the original `extra`
            # mapping is still stored on the `extra` field.
            return {**data, "extra": extra}

        # Otherwise, we are using our mock-dict implementation, so we store our
        # metadata in the `extra` field.
        return {"extra": data}

    @classmethod
    def _is_field_typed(cls):
        """
        Check if the metadata model is field typed.

        This is used to determine if the metadata model is a dictionary-like model,
        or a more strictly typed model.
        """
        # Any declared field beyond `extra` means the model has been sub-classed.
        return set(cls.model_fields.keys()) != {"extra"}

    def __repr__(self):
        """
        Provide a string representation of the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __repr__ method.
        """
        if self._is_field_typed():
            return super().__repr__()

        # Otherwise, we are dealing with dictionary-like metadata
        return json.dumps(self.extra)

    def __getitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __getitem__ method.
        """
        if not self._is_field_typed():
            return self.extra[name]

        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __setitem__(self, name, value):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __setitem__ method.
        """
        if not self._is_field_typed():
            self.extra[name] = value
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __delitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __delitem__ method.
        """
        if not self._is_field_typed():
            del self.extra[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __iter__(self):
        """
        Iterate over the keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __iter__ method.
        """
        if self._is_field_typed():
            raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

        return iter(self.extra)

    def __len__(self):
        """
        Get the number of keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __len__ method.
        """
        if self._is_field_typed():
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '__len__'"
            )

        return len(self.extra)

    def __getattr__(self, name):
        """
        Allow for getting of attributes on the metadata class.

        The attributes are retrieved through the following hierarchy:
            - If the model is sub-classed, it will be retrieved as normal.
            - Otherwise, if the attribute is private, it will be retrieved as normal.
            - Finally, if we are getting a public attribute on the base metadata class,
                we use the extra field.
            - If the key is not set in the `extra` dictionary, we fall back to the
              model's normal attribute lookup.
        """
        if self._is_field_typed():
            return super().__getattr__(name)

        if name.startswith("_"):
            return super().__getattr__(name)

        try:
            # Indexing (rather than `.get`) is deliberate: a missing key must raise
            # KeyError so the fallback below actually runs. The previous `.get`
            # call silently returned None and made the fallback dead code.
            return self.extra[name]
        except KeyError:
            # Fall back to normal lookup for attributes defined on the class.
            return super().__getattr__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Allow for setting of attributes on the metadata class.

        The attributes are set through the following hierarchy:
            - If the model is sub-classed, it will be set as normal.
            - Otherwise, if the attribute is private, it will be set as normal.
            - Finally, if we are setting a public attribute on the base metadata class,
                we use the extra field.
        """
        if self._is_field_typed():
            return super().__setattr__(name, value)

        # We want to avoid setting any private attributes in the extra
        # dictionary
        if name.startswith("_"):
            return super().__setattr__(name, value)

        # If it is `owner` or `task_results`, we want
        # to avoid setting the attribute in the `extra` dictionary
        if name in ["owner", "task_results"]:
            return super().__setattr__(name, value)

        self.extra[name] = value

    def __delattr__(self, name: str) -> None:
        """
        Ensure that we can delete attributes from the metadata class.

        The attributes are deleted through the following hierarchy:
            - If the attribute is `task_results`, we use the descriptor to delete the task results.
            - Otherwise, if it is a sub-classed model, it will be deleted as normal.
            - Finally, if we are deleting a public attribute on the base metadata class,
                we use the extra field.
        """

        # We want to use the descriptor to delete the task results
        if name == "task_results":
            self._task_results.__delete__(self)
            return

        # Otherwise, we use our standard fallback tiers
        if self._is_field_typed():
            return super().__delattr__(name)

        del self.extra[name]

owner: TMetadataOwner property writable

Return the owner of the metadata.

task_results: TaskResultsDescriptor property writable

Return the task results descriptor.

__delattr__(name)

Ensure that we can delete attributes from the metadata class.

The attributes are deleted through the following hierarchy
  • If the attribute is task_results, we use the descriptor to delete the task results.
  • Otherwise, if it is a sub-classed model, it will be deleted as normal.
  • Finally, if we are deleting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/metadata.py
def __delattr__(self, name: str) -> None:
    """
    Ensure that we can delete attributes from the metadata class.

    The attributes are deleted through the following heirarchy:
        - If the attribute is `task_results`, we use the descriptor to delete the task results.
        - Otherwise, if it is a sub-classed model, it will be deleted as normal.
        - Finally, if we are deleting a public attribute on the base metadata class,
            we use the extra field.
    """

    # We want to use the descriptor to delete the task results
    if name == "task_results":
        self._task_results.__delete__(self)
        return

    # Otherwise, we use our standard fallback tiers
    if self._is_field_typed():
        return super().__delattr__(name)

    del self.extra[name]

__delitem__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a `__delitem__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __delitem__(self, name):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __delitem__ method.
    """
    if not self._is_field_typed():
        del self.extra[name]
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

__getattr__(name)

Allow for getting of attributes on the metadata class.

The attributes are retrieved through the following hierarchy
  • If the model is sub-classed, it will be retrieved as normal.
  • Otherwise, if the attribute is private, it will be retrieved as normal.
  • Finally, if we are getting a public attribute on the base metadata class, we use the extra field.
  • If the key is not set in the extra dictionary, we resort back to just trying to get the field.
    • This is when we grab the owner or task_result attribute.
Source code in docprompt/schema/pipeline/metadata.py
def __getattr__(self, name):
    """
    Allow for getting of attributes on the metadata class.

    The attributes are retrieved through the following heirarchy:
        - If the model is sub-classed, it will be retrieved as normal.
        - Otherwise, if the attribute is private, it will be retrieved as normal.
        - Finally, if we are getting a public attribute on the base metadata class,
            we use the extra field.
        - If the key is not set in the `extra` dictionary, we resort back to just
        trying to get the field.
            - This is when we grab the `owner` or `task_result` attribuite.
    """
    if self._is_field_typed():
        return super().__getattr__(name)

    if name.startswith("_"):
        return super().__getattr__(name)

    # Attempt to retreieve the attr from the `extra` field
    try:
        return self.extra.get(name)

    except KeyError:
        # This is for grabbing properties on the base metadata class
        return super().__getattr__(name)

__getitem__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a `__getitem__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __getitem__(self, name):
    """
    Provide dictionary functionlaity to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __getitem__ method.
    """
    if not self._is_field_typed():
        return self.extra[name]

    raise AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '{name}'"
    )

__iter__()

Iterate over the keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an `__iter__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __iter__(self):
    """
    Iterate over the keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __iter__ method.
    """
    if self._is_field_typed():
        raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

    return iter(self.extra)

__len__()

Get the number of keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a `__len__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __len__(self):
    """
    Get the number of keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have a __len__ method.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '__len__'"
        )

    return len(self.extra)

__repr__()

Provide a string representation of the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a `__repr__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __repr__(self):
    """
    Provide a string representation of the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have a __repr__ method.
    """
    if self._is_field_typed():
        return super().__repr__()

    # Otherwise, we are deailing with dictornary-like metadata
    return json.dumps(self.extra)

__setattr__(name, value)

Allow for setting of attributes on the metadata class.

The attributes are set through the following hierarchy
  • If the model is sub-classed, it will be set as normal.
  • Otherwise, if the attribute is private, it will be set as normal.
  • Finally, if we are setting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/metadata.py
def __setattr__(self, name: str, value: Any) -> None:
    """
    Allow for setting of attributes on the metadata class.

    The attributes are set through the following heirarchy:
        - If the model is sub-classed, it will be set as normal.
        - Otherwise, if the attribute is private, it will be set as normal.
        - Finally, if we are setting a public attribute on the base metadata class,
            we use the extra field.
    """
    if self._is_field_typed():
        return super().__setattr__(name, value)

    # We want to avoid setting any private attributes in the extra
    # dictionary
    if name.startswith("_"):
        return super().__setattr__(name, value)

    # If it is `owner` or `task_results`, we want
    # to avoid setting the attribute in the `extra` dictionary
    if name in ["owner", "task_results"]:
        return super().__setattr__(name, value)

    self.extra[name] = value

__setitem__(name, value)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a `__setitem__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __setitem__(self, name, value):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __setitem__ method.
    """
    if not self._is_field_typed():
        self.extra[name] = value
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

from_owner(owner, **data) classmethod

Create a new instance of the metadata class with the owner set.

Source code in docprompt/schema/pipeline/metadata.py
@classmethod
def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
    """Build a metadata instance from keyword data, then attach its owner."""
    instance = cls(**data)
    instance.owner = owner
    return instance

validate_data_fields_from_annotations(data) classmethod

Validate the data fields from the annotations.

Source code in docprompt/schema/pipeline/metadata.py
@model_validator(mode="before")
@classmethod
def validate_data_fields_from_annotations(cls, data: Any) -> Any:
    """Merge the `extra` payload into the incoming data and route the fields."""

    # Pull `extra` out and fold its keys into the rest of the payload so both
    # declared fields and dynamic keys are visible to validation.
    extra = data.pop("extra", {})
    assert isinstance(extra, dict), "The `extra` field must be a dictionary."
    merged = {**data, **extra}

    if not cls._is_field_typed():
        # Dict-like models keep everything under the `extra` field.
        return {"extra": merged}

    # Field-typed (sub-classed) models validate the merged keys normally;
    # keys not declared on the model are ignored, while the original `extra`
    # mapping is still stored on the `extra` field.
    return {**merged, "extra": extra}

DocumentCollection

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py
class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata
    """

    # The document nodes that make up this collection.
    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    # Collection-level metadata. NOTE(review): `default_factory=dict` produces a
    # plain dict by default, whereas DocumentNode defaults its metadata to
    # `BaseMetadata` — confirm the dict default here is intentional.
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)

DocumentNode

Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/document.py
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    """
    Represents a single document, with some metadata
    """

    # The wrapped PDF document.
    document: PdfDocument
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    )
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",
        default_factory=BaseMetadata,
    )

    # Provenance locator, built lazily on first access to `locator`.
    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    # Base storage path used by `persist` / `from_storage`.
    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        """Serialize the node with the locator cleared out of private state."""
        state = super().__getstate__()

        # The locator is dropped on serialization — presumably because it is
        # not serializable (TODO confirm); it is rebuilt lazily via `locator`.
        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        """Return the number of pages in the document."""
        return len(self.page_nodes)

    def __getitem__(self, index):
        """Return the page node at *index* (0-based)."""
        return self.page_nodes[index]

    def __iter__(self):
        """Iterate over the document's page nodes."""
        return iter(self.page_nodes)

    @property
    def rasterizer(self):
        """Return a rasterizer bound to this document node."""
        return DocumentRasterizer(self)

    @property
    def locator(self):
        """Return the provenance locator, building it on first access."""
        if self._locator is None:
            self.refresh_locator()

        return self._locator

    def refresh_locator(self):
        """
        Refreshes the locator for this document node
        """
        # Local import — presumably to avoid a circular import with the
        # provenance package; confirm before hoisting to module level.
        from docprompt.provenance.search import DocumentProvenanceLocator

        # The locator is built from OCR results, so every page must have one.
        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    @classmethod
    def from_document(
        cls,
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
    ):
        """Create a DocumentNode (and its page nodes) from a PdfDocument.

        Args:
            document: The PDF document to wrap.
            document_metadata: Optional pre-built document metadata; when omitted,
                an empty metadata instance owned by the new node is created.
            page_metadata: Optional per-page metadata; when provided, its length
                must match the number of pages in the document.

        Raises:
            ValueError: If `page_metadata` is given but its length does not
                match the page count.
        """
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
            document=document,
        )
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}
        )

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."
            )

        # Page numbers are 1-indexed.
        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    document=document_node,
                    page_number=page_number,
                    metadata=page_metadata[page_number - 1],
                )
            else:
                page_node = PageNode(document=document_node, page_number=page_number)

            document_node.page_nodes.append(page_node)

        return document_node

    @property
    def file_hash(self):
        """Return the hash of the underlying document."""
        return self.document.document_hash

    @property
    def document_name(self):
        """Return the name of the underlying document."""
        return self.document.name

    @classmethod
    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # Resolve the concrete annotation of this subclass's `metadata` field.
        metadata_field_annotation = fields["metadata"].annotation

        # If the annotation is still the bare generic type variable, no
        # override was supplied — fall back to BaseMetadata.
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
            )

        # Return the overridden generic type of DocumentNodeMetadata.
        return metadata_field_annotation

    @classmethod
    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # Resolve the concrete annotation of the page node's `metadata` field.
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
            "metadata"
        ].annotation

        # A bare generic type variable means no override was supplied.
        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."
            )

        return page_node_metadata_field_annotation

    @property
    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    @persistance_path.setter
    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    @classmethod
    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

        Args:
            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            DocumentNode: The loaded document node.
        """

        fs_manager = FileSystemManager(path, **kwargs)

        # One read returns the PDF plus both metadata sidecar payloads.
        pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
            file_hash, **kwargs
        )

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        # Restore document-level metadata from its sidecar, or build it empty.
        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
        else:
            metadata = cls.metadata_class().from_owner(node, **{})

        # The page sidecar holds a JSON list of JSON strings, one per page
        # (matching the format written by `persist`).
        if page_metadata_bytes:
            page_metadata_json = [
                json.loads(page_str)
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            ]
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            ]
        else:
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            # NOTE(review): `set_owner` is not defined on the BaseMetadata shown
            # in this module (it exposes an `owner` property setter instead) —
            # confirm the page metadata class actually provides `set_owner`.
            meta.set_owner(page)
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

        Args:
            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            FileSidecarsPathManager: The file path manager for the persisted document node.

        Raises:
            ValueError: If no path is provided and `persistance_path` is unset.
        """

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        # Page metadata is written as a JSON list of JSON strings, the format
        # that `from_storage` reads back.
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
            encoding="utf-8",
        )

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
        )

persistance_path property writable

The base path to storage location.

from_storage(path, file_hash, **kwargs) classmethod

Load the document node from storage.

Parameters:

Name Type Description Default
path str

The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"

required
file_hash str

The hash of the document.

required
**kwargs

Additional keyword arguments for fsspec FileSystem

{}

Returns:

Name Type Description
DocumentNode Self

The loaded document node.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Load the document node from storage.

    Args:
        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        DocumentNode: The loaded document node.
    """

    fs_manager = FileSystemManager(path, **kwargs)

    # One read returns the PDF plus both metadata sidecar payloads.
    pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
        file_hash, **kwargs
    )

    doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
    node = cls.from_document(doc)

    # Restore document-level metadata from its sidecar, or build it empty.
    if metadata_bytes:
        metadata_json = json.loads(metadata_bytes.decode("utf-8"))
        metadata = cls.metadata_class().from_owner(node, **metadata_json)
    else:
        metadata = cls.metadata_class().from_owner(node, **{})

    # The page sidecar holds a JSON list of JSON strings, one per page.
    if page_metadata_bytes:
        page_metadata_json = [
            json.loads(page_str)
            for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
        ]
        page_metadata = [
            cls.page_metadata_class()(**page) for page in page_metadata_json
        ]
    else:
        page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

    # Store the metadata on the node and page nodes
    node.metadata = metadata
    for page, meta in zip(node.page_nodes, page_metadata):
        # NOTE(review): `set_owner` is not defined on the BaseMetadata shown in
        # this module (it exposes an `owner` property setter instead) — confirm
        # the page metadata class actually provides `set_owner`.
        meta.set_owner(page)
        page.metadata = meta

    # Make sure to set the persistance path on the node
    node.persistance_path = path

    return node

metadata_class() classmethod

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def metadata_class(cls) -> Type[BaseMetadata]:
    """Get the metadata class for instantiating metadata from the model."""

    # NOTE: relies on the generic type being the SECOND of the two arguments
    # in the `Union` annotation
    annotation = cls.model_fields["metadata"].annotation

    # When no Generic override was supplied, fall back to the flexible
    # dictionary-like base implementation
    if annotation == DocumentNodeMetadata:
        return BaseMetadata

    if isinstance(annotation, ForwardRef):
        raise ValueError(
            "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
        )

    # Otherwise, return the overridden Generic type of the DocumentNodeMetadata
    return annotation

page_metadata_class() classmethod

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Get the metadata class for the page nodes in the document."""

    # The `page_nodes` field is a List[...]; grab the page node class from
    # the single argument of the List annotation
    page_node_cls = cls.model_fields["page_nodes"].annotation.__args__[0]

    # NOTE: relies on the generic type being the SECOND of the two arguments
    # in the `Union` annotation
    annotation = page_node_cls.model_fields["metadata"].annotation

    # No Generic override -> dictionary-like base implementation
    if annotation == PageNodeMetadata:
        return BaseMetadata

    if isinstance(annotation, ForwardRef):
        raise ValueError(
            "You cannot define PageNode with a ForwardRef for Generic metadata model types."
        )

    return annotation

persist(path=None, **kwargs)

Persist a document node to storage.

Parameters:

Name Type Description Default
path Optional[str]

Overwrites the current persistance_path property - If persistance_path is not currently set, path must be provided.

None
**kwargs

Additional keyword arguments for fsspec FileSystem

{}

Returns:

Name Type Description
FileSidecarsPathManager FileSidecarsPathManager

The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/document.py
def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Persist a document node to storage.

    Args:
        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        FileSidecarsPathManager: The file path manager for the persisted document node.
    """

    # Fall back to the previously stored path when none is supplied
    target = path or self.persistance_path

    if target is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Remember the destination so later calls can omit `path`
    self.persistance_path = target

    fs_manager = FileSystemManager(target, **kwargs)

    # Serialize the PDF plus the document- and page-level metadata sidecars
    pdf_payload = self.document.get_bytes()
    doc_meta_payload = self.metadata.model_dump_json().encode("utf-8")
    page_meta_payload = json.dumps(
        [page.metadata.model_dump_json() for page in self.page_nodes]
    ).encode("utf-8")

    return fs_manager.write(
        pdf_payload, doc_meta_payload, page_meta_payload, **kwargs
    )

refresh_locator()

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/document.py
def refresh_locator(self):
    """
    Refreshes the locator for this document node
    """
    from docprompt.provenance.search import DocumentProvenanceLocator

    # A locator needs OCR results for every page; bail out early otherwise
    if not all(page.ocr_results.result for page in self.page_nodes):
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"
        )

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator

PageNode

Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/page.py
class PageNode(BaseNode, Generic[PageNodeMetadata]):
    """
    Represents a single page in a document, with some metadata
    """

    # Back-reference to the owning document node; excluded from serialization
    # and repr to avoid a DocumentNode -> PageNode -> DocumentNode cycle
    document: "DocumentNode" = Field(exclude=True, repr=False)
    # PositiveInt, so page numbers are 1-based
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        description="Application-specific metadata for the page",
        default_factory=BaseMetadata,
    )
    extra: Dict[str, Any] = Field(
        description="Extra data that can be stored on the page node",
        default_factory=dict,
    )

    ocr_results: ResultContainer[OcrPageResult] = Field(
        default_factory=_result_container_factory,
        description="The OCR results for the page",
        repr=False,
    )

    # In-memory cache of rasterized page bytes; intentionally dropped when
    # pickling (see __getstate__ below)
    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        """Pickle support: clear the raster cache so serialized nodes stay small."""
        state = super().__getstate__()

        state["__pydantic_private__"]["_raster_cache"] = {}

        return state

    @property
    def rasterizer(self):
        """Return a PageRasterizer bound to this page and its raster cache."""
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True
    ):
        """Search for `query` on this page via the parent document's locator."""
        return self.document.locator.search(
            query,
            page_number=self.page_number,
            refine_to_word=refine_to_words,
            require_exact_match=require_exact_match,
        )

metadata

The metadata class is utilized for defining a basic, yet flexible interface for metadata attached to various fields.

In essence, this allows developers to choose to either create their metadata in an unstructured manner (i.e. a dictionary), or to sub-class the base metadata class in order to create a more strictly typed metadata model for their page and document nodes.

BaseMetadata

Bases: BaseModel, MutableMapping, Generic[TMetadataOwner]

The base metadata class is utilized for defining a basic yet flexible interface for metadata attached to various fields.

When used out of the box, the metadata class will adopt dictionary-like behavior. You may easily access different fields of the metadata as if it were a dictionary:

# Instantiate it with any kwargs you like
metadata = BaseMetadata(foo='bar', cow='moo')

metadata["foo"]  # "bar"
metadata["cow"]  # "moo"

# Update the value of the key
metadata["foo"] = "fighters"

# Set new key-value pairs
metadata['sheep'] = 'baa'

Otherwise, you may sub-class the metadata class in order to create a more strictly typed metadata model. This is useful when you want to enforce a specific structure for your metadata.

class CustomMetadata(BaseMetadata):
    foo: str
    cow: str

# Instantiate it with the required fields
metadata = CustomMetadata(foo='bar', cow='moo')

metadata.foo  # "bar"
metadata.cow  # "moo"

# Update the value of the key
metadata.foo = "fighters"

# Use the extra field to store dynamic metadata
metadata.extra['sheep'] = 'baa'

Additionally, the task results descriptor allows for controlled and easy access to the task results of various tasks that are run on the parent node.

Source code in docprompt/schema/pipeline/metadata.py
class BaseMetadata(BaseModel, MutableMapping, Generic[TMetadataOwner]):
    """
    The base metadata class is utilized for defining a basic yet flexible interface
    for metadata attached to various fields.

    The metadata class can be used in two ways:
        1. As a dictionary-like object, where metadata is stored in the `extra` field.
        2. As a sub-classed model, where metadata is stored in the fields of the model.

    When used out of the box, the metadata class will adopt dictionary-like behavior. You
    may easily access different fields of the metadata as if it were a dictionary:
    ```python
    # Instantiate it with any kwargs you like
    metadata = BaseMetadata(foo='bar', cow='moo')

    metadata["foo"]  # "bar"
    metadata["cow"]  # "moo"

    # Update the value of the key
    metadata["foo"] = "fighters"

    # Set new key-value pairs
    metadata['sheep'] = 'baa'
    ```

    Otherwise, you may sub-class the metadata class in order to create a more strictly typed
    metadata model. This is useful when you want to enforce a specific structure for your metadata.

    ```python
    class CustomMetadata(BaseMetadata):
        foo: str
        cow: str

    # Instantiate it with the required fields
    metadata = CustomMetadata(foo='bar', cow='moo')

    metadata.foo  # "bar"
    metadata.cow  # "moo"

    # Update the value of the key
    metadata.foo = "fighters"

    # Use the extra field to store dynamic metadata
    metadata.extra['sheep'] = 'baa'
    ```

    Additionally, the task results descriptor allows for controlled and easy access to the task results
    of various tasks that are run on the parent node.
    """

    # Catch-all storage used when the model is NOT sub-classed (dict-like mode).
    # NOTE(review): `Field(..., default_factory=dict)` mixes the "required"
    # Ellipsis marker with a default; pydantic v2 drops the Ellipsis when a
    # factory is present, so the factory alone would express the intent — confirm.
    extra: Dict[str, Any] = Field(..., default_factory=dict, repr=False)

    # Descriptor managing task results; accessed via the `task_results` property
    _task_results: TaskResultsDescriptor = PrivateAttr(
        default_factory=TaskResultsDescriptor
    )

    # Set via `from_owner` or the `owner` property after construction
    _owner: TMetadataOwner = PrivateAttr()

    @property
    def task_results(self) -> TaskResultsDescriptor:
        """Return the task results descriptor."""
        return self._task_results.__get__(self)

    @task_results.setter
    def task_results(self, value: Any) -> None:
        """This will raise an error, as we do not want to set the task results directly.

        NOTE: This implementation is here purely to avoid the task_results property from being
        overwritten by accident.
        """
        self._task_results.__set__(self, value)

    @property
    def owner(self) -> TMetadataOwner:
        """Return the owner of the metadata."""
        return self._owner

    @owner.setter
    def owner(self, owner: TMetadataOwner) -> None:
        """Set the owner of the metadata."""
        self._owner = owner

    @classmethod
    def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
        """Create a new instance of the metadata class with the owner set."""
        metadata = cls(**data)
        metadata.owner = owner
        return metadata

    @model_validator(mode="before")
    @classmethod
    def validate_data_fields_from_annotations(cls, data: Any) -> Any:
        """Validate the data fields from the annotations."""

        # We want to make sure that we combine the `extra` metadata along with any
        # other specific fields that are defined in the metadata.
        extra = data.pop("extra", {})
        assert isinstance(extra, dict), "The `extra` field must be a dictionary."
        data = {**data, **extra}

        # If the model has been sub-classed, then all of our fields must be
        # validated by the pydantic model.
        if cls._is_field_typed():
            # We will get the fields out of extra and set them as potential fields to
            # validate. They will be ignored if they are not defined in the model, but it
            # allows for a more flexible way to define metadata.
            # Otherwise, whatever is in the `extra` field will be stored in the `extra` field.
            return {**data, "extra": extra}

        # Otherwise, we are using our mock-dict implementation, so we store our
        # metadata in the `extra` field.
        return {"extra": data}

    @classmethod
    def _is_field_typed(cls):
        """
        Check if the metadata model is field typed.

        This is used to determine if the metadata model is a dictionary-like model,
        or a more strictly typed model.
        """
        if set(["extra"]) != set(cls.model_fields.keys()):
            return True

        return False

    def __repr__(self):
        """
        Provide a string representation of the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __repr__ method.
        """
        if self._is_field_typed():
            return super().__repr__()

        # Otherwise, we are dealing with dictionary-like metadata
        return json.dumps(self.extra)

    def __getitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __getitem__ method.
        """
        if not self._is_field_typed():
            return self.extra[name]

        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __setitem__(self, name, value):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __setitem__ method.
        """
        if not self._is_field_typed():
            self.extra[name] = value
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __delitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __delitem__ method.
        """
        if not self._is_field_typed():
            del self.extra[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __iter__(self):
        """
        Iterate over the keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __iter__ method.
        """
        if self._is_field_typed():
            raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

        return iter(self.extra)

    def __len__(self):
        """
        Get the number of keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __len__ method.
        """
        if self._is_field_typed():
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '__len__'"
            )

        return len(self.extra)

    def __getattr__(self, name):
        """
        Allow for getting of attributes on the metadata class.

        The attributes are retrieved through the following hierarchy:
            - If the model is sub-classed, it will be retrieved as normal.
            - Otherwise, if the attribute is private, it will be retrieved as normal.
            - Finally, if we are getting a public attribute on the base metadata class,
                we use the extra field.
            - If the key is not set in the `extra` dictionary, we resort back to just
            trying to get the field.
                - This is when we grab the `owner` or `task_result` attribute.
        """
        if self._is_field_typed():
            return super().__getattr__(name)

        if name.startswith("_"):
            return super().__getattr__(name)

        # Attempt to retrieve the attr from the `extra` field
        # NOTE(review): `dict.get` returns None for missing keys instead of
        # raising KeyError, so the fallback below is unreachable and unknown
        # attributes resolve to None — confirm whether that is intended.
        try:
            return self.extra.get(name)

        except KeyError:
            # This is for grabbing properties on the base metadata class
            return super().__getattr__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Allow for setting of attributes on the metadata class.

        The attributes are set through the following hierarchy:
            - If the model is sub-classed, it will be set as normal.
            - Otherwise, if the attribute is private, it will be set as normal.
            - Finally, if we are setting a public attribute on the base metadata class,
                we use the extra field.
        """
        if self._is_field_typed():
            return super().__setattr__(name, value)

        # We want to avoid setting any private attributes in the extra
        # dictionary
        if name.startswith("_"):
            return super().__setattr__(name, value)

        # If it is `owner` or `task_results`, we want
        # to avoid setting the attribute in the `extra` dictionary
        if name in ["owner", "task_results"]:
            return super().__setattr__(name, value)

        self.extra[name] = value

    def __delattr__(self, name: str) -> None:
        """
        Ensure that we can delete attributes from the metadata class.

        The attributes are deleted through the following hierarchy:
            - If the attribute is `task_results`, we use the descriptor to delete the task results.
            - Otherwise, if it is a sub-classed model, it will be deleted as normal.
            - Finally, if we are deleting a public attribute on the base metadata class,
                we use the extra field.
        """

        # We want to use the descriptor to delete the task results
        if name == "task_results":
            self._task_results.__delete__(self)
            return

        # Otherwise, we use our standard fallback tiers
        if self._is_field_typed():
            return super().__delattr__(name)

        del self.extra[name]

owner: TMetadataOwner property writable

Return the owner of the metadata.

task_results: TaskResultsDescriptor property writable

Return the task results descriptor.

__delattr__(name)

Ensure that we can delete attributes from the metadata class.

The attributes are deleted through the following hierarchy
  • If the attribute is task_results, we use the descriptor to delete the task results.
  • Otherwise, if it is a sub-classed model, it will be deleted as normal.
  • Finally, if we are deleting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/metadata.py
def __delattr__(self, name: str) -> None:
    """
    Ensure that we can delete attributes from the metadata class.

    The attributes are deleted through the following heirarchy:
        - If the attribute is `task_results`, we use the descriptor to delete the task results.
        - Otherwise, if it is a sub-classed model, it will be deleted as normal.
        - Finally, if we are deleting a public attribute on the base metadata class,
            we use the extra field.
    """

    # We want to use the descriptor to delete the task results
    if name == "task_results":
        self._task_results.__delete__(self)
        return

    # Otherwise, we use our standard fallback tiers
    if self._is_field_typed():
        return super().__delattr__(name)

    del self.extra[name]

__delitem__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an delitem method.

Source code in docprompt/schema/pipeline/metadata.py
def __delitem__(self, name):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __delitem__ method.
    """
    if not self._is_field_typed():
        del self.extra[name]
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

__getattr__(name)

Allow for getting of attributes on the metadata class.

The attributes are retrieved through the following hierarchy
  • If the model is sub-classed, it will be retrieved as normal.
  • Otherwise, if the attribute is private, it will be retrieved as normal.
  • Finally, if we are getting a public attribute on the base metadata class, we use the extra field.
  • If the key is not set in the extra dictionary, we resort back to just trying to get the field.
    • This is when we grab the owner or task_result attribute.
Source code in docprompt/schema/pipeline/metadata.py
def __getattr__(self, name):
    """
    Allow for getting of attributes on the metadata class.

    The attributes are retrieved through the following heirarchy:
        - If the model is sub-classed, it will be retrieved as normal.
        - Otherwise, if the attribute is private, it will be retrieved as normal.
        - Finally, if we are getting a public attribute on the base metadata class,
            we use the extra field.
        - If the key is not set in the `extra` dictionary, we resort back to just
        trying to get the field.
            - This is when we grab the `owner` or `task_result` attribuite.
    """
    if self._is_field_typed():
        return super().__getattr__(name)

    if name.startswith("_"):
        return super().__getattr__(name)

    # Attempt to retreieve the attr from the `extra` field
    try:
        return self.extra.get(name)

    except KeyError:
        # This is for grabbing properties on the base metadata class
        return super().__getattr__(name)

__getitem__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an getitem method.

Source code in docprompt/schema/pipeline/metadata.py
def __getitem__(self, name):
    """
    Provide dictionary functionlaity to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __getitem__ method.
    """
    if not self._is_field_typed():
        return self.extra[name]

    raise AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '{name}'"
    )

__iter__()

Iterate over the keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an iter method.

Source code in docprompt/schema/pipeline/metadata.py
def __iter__(self):
    """
    Iterate over the keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __iter__ method.
    """
    if self._is_field_typed():
        raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

    return iter(self.extra)

__len__()

Get the number of keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a len method.

Source code in docprompt/schema/pipeline/metadata.py
def __len__(self):
    """
    Get the number of keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have a __len__ method.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '__len__'"
        )

    return len(self.extra)

__repr__()

Provide a string representation of the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a repr method.

Source code in docprompt/schema/pipeline/metadata.py
def __repr__(self):
    """
    Provide a string representation of the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have a __repr__ method.
    """
    if self._is_field_typed():
        return super().__repr__()

    # Otherwise, we are deailing with dictornary-like metadata
    return json.dumps(self.extra)

__setattr__(name, value)

Allow for setting of attributes on the metadata class.

The attributes are set through the following hierarchy
  • If the model is sub-classed, it will be set as normal.
  • Otherwise, if the attribute is private, it will be set as normal.
  • Finally, if we are setting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/metadata.py
def __setattr__(self, name: str, value: Any) -> None:
    """
    Allow for setting of attributes on the metadata class.

    The attributes are set through the following heirarchy:
        - If the model is sub-classed, it will be set as normal.
        - Otherwise, if the attribute is private, it will be set as normal.
        - Finally, if we are setting a public attribute on the base metadata class,
            we use the extra field.
    """
    if self._is_field_typed():
        return super().__setattr__(name, value)

    # We want to avoid setting any private attributes in the extra
    # dictionary
    if name.startswith("_"):
        return super().__setattr__(name, value)

    # If it is `owner` or `task_results`, we want
    # to avoid setting the attribute in the `extra` dictionary
    if name in ["owner", "task_results"]:
        return super().__setattr__(name, value)

    self.extra[name] = value

__setitem__(name, value)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an setitem method.

Source code in docprompt/schema/pipeline/metadata.py
def __setitem__(self, name, value):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __setitem__ method.
    """
    if not self._is_field_typed():
        self.extra[name] = value
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

from_owner(owner, **data) classmethod

Create a new instance of the metadata class with the owner set.

Source code in docprompt/schema/pipeline/metadata.py
@classmethod
def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
    """Create a new instance of the metadata class with the owner set."""
    # Build the instance first, then bind its owner back-reference
    instance = cls(**data)
    instance.owner = owner
    return instance

validate_data_fields_from_annotations(data) classmethod

Validate the data fields from the annotations.

Source code in docprompt/schema/pipeline/metadata.py
@model_validator(mode="before")
@classmethod
def validate_data_fields_from_annotations(cls, data: Any) -> Any:
    """Validate the data fields from the annotations."""

    # We want to make sure that we combine the `extra` metadata along with any
    # other specific fields that are defined in the metadata.
    extra = data.pop("extra", {})
    assert isinstance(extra, dict), "The `extra` field must be a dictionary."
    data = {**data, **extra}

    # If the model has been sub-classed, then all of our fields must be
    # validated by the pydantic model.
    if cls._is_field_typed():
        # We will get the fields out of extra and set them as potential fields to
        # validate. They will be ignored if they are not defined in the model, but it
        # allows for a more flexible way to define metadata.
        # Otherwise, whatever is in the `extra` field will be stored in the `extra` field.
        return {**data, "extra": extra}

    # Otherwise, we are using our mock-dict implementation, so we store our
    # metadata in the `extra` field.
    return {"extra": data}

node

DocumentCollection

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py
class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata
    """

    # The document nodes in this collection (forward-ref: DocumentNode is
    # declared in a sibling module)
    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    # Collection-level metadata; defaults to an empty dict.
    # FIX: `Field(..., default_factory=dict)` mixed the "required" Ellipsis
    # marker with a default — the factory alone expresses the intent and is
    # equivalent under pydantic v2, which drops the Ellipsis when a factory
    # is supplied.
    metadata: DocumentCollectionMetadata = Field(default_factory=dict)

DocumentNode

Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/document.py
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    """
    Represents a single document, with some metadata.

    Generic over two metadata types: ``DocumentNodeMetadata`` for the document
    itself and ``PageNodeMetadata`` for each of its pages.
    """

    document: PdfDocument
    # Ordered PageNode children; populated by `from_document`.
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    )
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",
        default_factory=BaseMetadata,
    )

    # Lazily-built provenance locator; dropped when pickling (see __getstate__).
    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    # Base storage path used by `persist`; set by `from_storage` or the setter.
    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        """Exclude the locator from pickled state; it is rebuilt lazily on access."""
        state = super().__getstate__()

        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        """Return the number of pages in the document."""
        return len(self.page_nodes)

    def __getitem__(self, index):
        """Return the page node at ``index`` (0-based)."""
        return self.page_nodes[index]

    def __iter__(self):
        """Iterate over the page nodes in order."""
        return iter(self.page_nodes)

    @property
    def rasterizer(self):
        """A rasterizer bound to this document node."""
        return DocumentRasterizer(self)

    @property
    def locator(self):
        """The provenance locator, built on first access."""
        if self._locator is None:
            self.refresh_locator()

        return self._locator

    def refresh_locator(self):
        """
        Refreshes the locator for this document node
        """
        from docprompt.provenance.search import DocumentProvenanceLocator

        # The locator requires OCR results for every page.
        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    @classmethod
    def from_document(
        cls,
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
    ):
        """Build a document node (and its page nodes) from a PDF document.

        Args:
            document: The backing PDF document.
            document_metadata: Optional document metadata; defaults to an empty
                instance of the node's metadata class.
            page_metadata: Optional per-page metadata; when given, its length
                must equal the number of pages in the document.

        Raises:
            ValueError: If ``page_metadata`` length does not match the page count.
        """
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
            document=document,
        )
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}
        )

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."
            )

        # Page numbers are 1-based; metadata (if provided) is indexed 0-based.
        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    document=document_node,
                    page_number=page_number,
                    metadata=page_metadata[page_number - 1],
                )
            else:
                page_node = PageNode(document=document_node, page_number=page_number)

            document_node.page_nodes.append(page_node)

        return document_node

    @property
    def file_hash(self):
        """The hash of the backing PDF document."""
        return self.document.document_hash

    @property
    def document_name(self):
        """The name of the backing PDF document."""
        return self.document.name

    @classmethod
    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, we want to retrieve
        # it as a TypedDict
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
            )

        # Get the overridden Generic type of the DocumentNodeMetadata
        return metadata_field_annotation

    @classmethod
    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
            "metadata"
        ].annotation

        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."
            )

        return page_node_metadata_field_annotation

    @property
    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    @persistance_path.setter
    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    @classmethod
    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

        Args:
            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            DocumentNode: The loaded document node.
        """

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
            file_hash, **kwargs
        )

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        # Missing sidecar bytes fall back to empty metadata instances.
        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
        else:
            metadata = cls.metadata_class().from_owner(node, **{})

        if page_metadata_bytes:
            # The page sidecar is a JSON array of JSON-encoded strings.
            page_metadata_json = [
                json.loads(page_str)
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            ]
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            ]
        else:
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            meta.set_owner(page)
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

        Args:
            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            FileSidecarsPathManager: The file path manager for the persisted document node.
        """

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        # Page metadata is stored as a JSON array of JSON-encoded strings,
        # mirroring the format expected by `from_storage`.
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
            encoding="utf-8",
        )

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
        )

persistance_path property writable

The base path to storage location.

from_storage(path, file_hash, **kwargs) classmethod

Load the document node from storage.

Parameters:

Name Type Description Default
path str

The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"

required
file_hash str

The hash of the document.

required
**kwargs

Additional keyword arguments for fsspec FileSystem

{}

Returns:

Name Type Description
DocumentNode Self

The loaded document node.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Load the document node from storage.

    Args:
        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        DocumentNode: The loaded document node.
    """

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
        file_hash, **kwargs
    )

    doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
    node = cls.from_document(doc)

    # Missing sidecar bytes fall back to empty metadata instances.
    if metadata_bytes:
        metadata_json = json.loads(metadata_bytes.decode("utf-8"))
        metadata = cls.metadata_class().from_owner(node, **metadata_json)
    else:
        metadata = cls.metadata_class().from_owner(node, **{})

    if page_metadata_bytes:
        # The page sidecar is a JSON array of JSON-encoded strings.
        page_metadata_json = [
            json.loads(page_str)
            for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
        ]
        page_metadata = [
            cls.page_metadata_class()(**page) for page in page_metadata_json
        ]
    else:
        page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

    # Store the metadata on the node and page nodes
    node.metadata = metadata
    for page, meta in zip(node.page_nodes, page_metadata):
        meta.set_owner(page)
        page.metadata = meta

    # Make sure to set the persistance path on the node
    node.persistance_path = path

    return node

metadata_class() classmethod

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def metadata_class(cls) -> Type[BaseMetadata]:
    """Get the metadata class for instantiating metadata from the model."""

    fields = cls.model_fields

    # NOTE: The indexing is important here, and relies on the generic type being
    # the SECOND of the two arguments in the `Union` annotation
    metadata_field_annotation = fields["metadata"].annotation

    # If no override has been provided to the metadata model, we want to retrieve
    # it as a TypedDict
    if metadata_field_annotation == DocumentNodeMetadata:
        return BaseMetadata

    if isinstance(metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
        )

    # Get the overridden Generic type of the DocumentNodeMetadata
    return metadata_field_annotation

page_metadata_class() classmethod

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Get the metadata class for the page nodes in the document."""
    fields = cls.model_fields

    # NOTE: The indexing is important here, and it allows us to get the type of each
    # page node in the `List` annotation
    page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

    # NOTE: The indexing is important here, and relies on the generic type being
    # the SECOND of the two arguments in the `Union` annotation
    page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
        "metadata"
    ].annotation

    # No override supplied: fall back to the flexible dict-like BaseMetadata.
    if page_node_metadata_field_annotation == PageNodeMetadata:
        return BaseMetadata

    if isinstance(page_node_metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define PageNode with a ForwardRef for Generic metadata model types."
        )

    return page_node_metadata_field_annotation

persist(path=None, **kwargs)

Persist a document node to storage.

Parameters:

Name Type Description Default
path Optional[str]

Overwrites the current persistance_path property - If persistance_path is not currently set, path must be provided.

None
**kwargs

Additional keyword arguments for fsspec FileSystem

{}

Returns:

Name Type Description
FileSidecarsPathManager FileSidecarsPathManager

The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/document.py
def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Persist a document node to storage.

    Args:
        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        FileSidecarsPathManager: The file path manager for the persisted document node.

    Raises:
        ValueError: If no path is given and `persistance_path` is unset.
    """

    path = path or self.persistance_path

    if path is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Make sure to update the persistance path
    self.persistance_path = path

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes = self.document.get_bytes()
    metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
    # Page metadata is stored as a JSON array of JSON-encoded strings,
    # mirroring the format expected by `from_storage`.
    page_metadata_bytes = bytes(
        json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
        encoding="utf-8",
    )

    return fs_manager.write(
        pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
    )

refresh_locator()

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/document.py
def refresh_locator(self):
    """
    Refreshes the locator for this document node
    """
    from docprompt.provenance.search import DocumentProvenanceLocator

    # The locator requires OCR results for every page.
    if any(not page.ocr_results.result for page in self.page_nodes):
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"
        )

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator

PageNode

Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/page.py
class PageNode(BaseNode, Generic[PageNodeMetadata]):
    """
    Represents a single page in a document, with some metadata
    """

    # Back-reference to the owning document; excluded from serialization and
    # repr to avoid cycles.
    document: "DocumentNode" = Field(exclude=True, repr=False)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        description="Application-specific metadata for the page",
        default_factory=BaseMetadata,
    )
    extra: Dict[str, Any] = Field(
        description="Extra data that can be stored on the page node",
        default_factory=dict,
    )

    ocr_results: ResultContainer[OcrPageResult] = Field(
        default_factory=_result_container_factory,
        description="The OCR results for the page",
        repr=False,
    )

    # In-memory cache of rasterized page bytes; not pickled (see __getstate__).
    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        """Exclude the raster cache from pickled state."""
        state = super().__getstate__()

        state["__pydantic_private__"]["_raster_cache"] = {}

        return state

    @property
    def rasterizer(self):
        """A rasterizer bound to this page, sharing the page's raster cache."""
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True
    ):
        """Search for ``query`` on this page via the owning document's locator."""
        return self.document.locator.search(
            query,
            page_number=self.page_number,
            refine_to_word=refine_to_words,
            require_exact_match=require_exact_match,
        )

base

BaseNode

Bases: BaseModel

The base node class is utilized for defining a basic yet flexible interface

Source code in docprompt/schema/pipeline/node/base.py
class BaseNode(BaseModel):
    """The base node class, defining a basic yet flexible interface shared by pipeline nodes (e.g. document and page nodes)."""

collection

DocumentCollection

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py
class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata
    """

    # The document nodes that make up this collection.
    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    # Collection-level metadata. The previous declaration combined the required
    # marker (`...`) with `default_factory`; pydantic v2 discards the Ellipsis
    # when a factory is given, so the marker was misleading and is dropped.
    # NOTE(review): other nodes default their metadata to BaseMetadata, not
    # dict — confirm whether `dict` here is intentional.
    metadata: DocumentCollectionMetadata = Field(default_factory=dict)

document

DocumentNode

Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/document.py
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    """
    Represents a single document, with some metadata.

    Generic over two metadata types: ``DocumentNodeMetadata`` for the document
    itself and ``PageNodeMetadata`` for each of its pages.
    """

    document: PdfDocument
    # Ordered PageNode children; populated by `from_document`.
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    )
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",
        default_factory=BaseMetadata,
    )

    # Lazily-built provenance locator; dropped when pickling (see __getstate__).
    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    # Base storage path used by `persist`; set by `from_storage` or the setter.
    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        """Exclude the locator from pickled state; it is rebuilt lazily on access."""
        state = super().__getstate__()

        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        """Return the number of pages in the document."""
        return len(self.page_nodes)

    def __getitem__(self, index):
        """Return the page node at ``index`` (0-based)."""
        return self.page_nodes[index]

    def __iter__(self):
        """Iterate over the page nodes in order."""
        return iter(self.page_nodes)

    @property
    def rasterizer(self):
        """A rasterizer bound to this document node."""
        return DocumentRasterizer(self)

    @property
    def locator(self):
        """The provenance locator, built on first access."""
        if self._locator is None:
            self.refresh_locator()

        return self._locator

    def refresh_locator(self):
        """
        Refreshes the locator for this document node
        """
        from docprompt.provenance.search import DocumentProvenanceLocator

        # The locator requires OCR results for every page.
        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    @classmethod
    def from_document(
        cls,
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
    ):
        """Build a document node (and its page nodes) from a PDF document.

        Args:
            document: The backing PDF document.
            document_metadata: Optional document metadata; defaults to an empty
                instance of the node's metadata class.
            page_metadata: Optional per-page metadata; when given, its length
                must equal the number of pages in the document.

        Raises:
            ValueError: If ``page_metadata`` length does not match the page count.
        """
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
            document=document,
        )
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}
        )

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."
            )

        # Page numbers are 1-based; metadata (if provided) is indexed 0-based.
        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    document=document_node,
                    page_number=page_number,
                    metadata=page_metadata[page_number - 1],
                )
            else:
                page_node = PageNode(document=document_node, page_number=page_number)

            document_node.page_nodes.append(page_node)

        return document_node

    @property
    def file_hash(self):
        """The hash of the backing PDF document."""
        return self.document.document_hash

    @property
    def document_name(self):
        """The name of the backing PDF document."""
        return self.document.name

    @classmethod
    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, we want to retrieve
        # it as a TypedDict
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
            )

        # Get the overridden Generic type of the DocumentNodeMetadata
        return metadata_field_annotation

    @classmethod
    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
            "metadata"
        ].annotation

        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."
            )

        return page_node_metadata_field_annotation

    @property
    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    @persistance_path.setter
    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    @classmethod
    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

        Args:
            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            DocumentNode: The loaded document node.
        """

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
            file_hash, **kwargs
        )

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        # Missing sidecar bytes fall back to empty metadata instances.
        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
        else:
            metadata = cls.metadata_class().from_owner(node, **{})

        if page_metadata_bytes:
            # The page sidecar is a JSON array of JSON-encoded strings.
            page_metadata_json = [
                json.loads(page_str)
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            ]
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            ]
        else:
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            meta.set_owner(page)
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

        Args:
            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            FileSidecarsPathManager: The file path manager for the persisted document node.
        """

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        # Page metadata is stored as a JSON array of JSON-encoded strings,
        # mirroring the format expected by `from_storage`.
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
            encoding="utf-8",
        )

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
        )
persistance_path property writable

The base path to storage location.

from_storage(path, file_hash, **kwargs) classmethod

Load the document node from storage.

Parameters:

Name Type Description Default
path str

The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"

required
file_hash str

The hash of the document.

required
**kwargs

Additional keyword arguments for fsspec FileSystem

{}

Returns:

Name Type Description
DocumentNode Self

The loaded document node.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Load the document node from storage.

    Args:
        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        DocumentNode: The loaded document node.
    """

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
        file_hash, **kwargs
    )

    doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
    node = cls.from_document(doc)

    # Missing sidecar bytes fall back to empty metadata instances.
    if metadata_bytes:
        metadata_json = json.loads(metadata_bytes.decode("utf-8"))
        metadata = cls.metadata_class().from_owner(node, **metadata_json)
    else:
        metadata = cls.metadata_class().from_owner(node, **{})

    if page_metadata_bytes:
        # The page sidecar is a JSON array of JSON-encoded strings.
        page_metadata_json = [
            json.loads(page_str)
            for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
        ]
        page_metadata = [
            cls.page_metadata_class()(**page) for page in page_metadata_json
        ]
    else:
        page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

    # Store the metadata on the node and page nodes
    node.metadata = metadata
    for page, meta in zip(node.page_nodes, page_metadata):
        meta.set_owner(page)
        page.metadata = meta

    # Make sure to set the persistance path on the node
    node.persistance_path = path

    return node
metadata_class() classmethod

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def metadata_class(cls) -> Type[BaseMetadata]:
    """Get the metadata class for instantiating metadata from the model."""

    fields = cls.model_fields

    # NOTE: The indexing is important here, and relies on the generic type being
    # the SECOND of the two arguments in the `Union` annotation
    metadata_field_annotation = fields["metadata"].annotation

    # If no override has been provided to the metadata model, we want to retrieve
    # it as a TypedDict
    if metadata_field_annotation == DocumentNodeMetadata:
        return BaseMetadata

    if isinstance(metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
        )

    # Get the overridden Generic type of the DocumentNodeMetadata
    return metadata_field_annotation
page_metadata_class() classmethod

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Get the metadata class for the page nodes in the document."""
    fields = cls.model_fields

    # NOTE: The indexing is important here, and it allows us to get the type of each
    # page node in the `List` annotation
    page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

    # NOTE: The indexing is important here, and relies on the generic type being
    # the SECOND of the two arguments in the `Union` annotation
    page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
        "metadata"
    ].annotation

    # No override supplied: fall back to the flexible dict-like BaseMetadata.
    if page_node_metadata_field_annotation == PageNodeMetadata:
        return BaseMetadata

    if isinstance(page_node_metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define PageNode with a ForwardRef for Generic metadata model types."
        )

    return page_node_metadata_field_annotation
persist(path=None, **kwargs)

Persist a document node to storage.

Parameters:

Name Type Description Default
path Optional[str]

Overwrites the current persistance_path property - If persistance_path is not currently set, path must be provided.

None
**kwargs

Additional keyword arguments for fsspec FileSystem

{}

Returns:

Name Type Description
FileSidecarsPathManager FileSidecarsPathManager

The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/document.py
def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Persist a document node to storage.

    Args:
        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        FileSidecarsPathManager: The file path manager for the persisted document node.

    Raises:
        ValueError: If no path is given and `persistance_path` is unset.
    """

    path = path or self.persistance_path

    if path is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Make sure to update the persistance path
    self.persistance_path = path

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes = self.document.get_bytes()
    metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
    # Page metadata is stored as a JSON array of JSON-encoded strings,
    # mirroring the format expected by `from_storage`.
    page_metadata_bytes = bytes(
        json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
        encoding="utf-8",
    )

    return fs_manager.write(
        pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
    )
refresh_locator()

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/document.py
def refresh_locator(self):
    """
    Rebuild the provenance locator for this document node and return it.
    """
    from docprompt.provenance.search import DocumentProvenanceLocator

    # The locator indexes OCR output, so every page must already have results.
    pages_missing_ocr = [
        page for page in self.page_nodes if not page.ocr_results.result
    ]
    if pages_missing_ocr:
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"
        )

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator

page

PageNode

Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/page.py
class PageNode(BaseNode, Generic[PageNodeMetadata]):
    """
    Represents a single page in a document, with some metadata.

    ``PageNodeMetadata`` is the generic metadata model attached to the page;
    it defaults to the dict-like ``BaseMetadata`` when left unparameterized.
    """

    # Back-reference to the owning DocumentNode; excluded from serialization
    # and repr to avoid recursive dumps through the parent.
    document: "DocumentNode" = Field(exclude=True, repr=False)
    # One-indexed page number (PositiveInt enforces >= 1).
    page_number: PositiveInt = Field(description="The page number")
    # Typed, application-specific metadata for this page.
    metadata: PageNodeMetadata = Field(
        description="Application-specific metadata for the page",
        default_factory=BaseMetadata,
    )
    # Free-form scratch storage alongside the typed metadata.
    extra: Dict[str, Any] = Field(
        description="Extra data that can be stored on the page node",
        default_factory=dict,
    )

    ocr_results: ResultContainer[OcrPageResult] = Field(
        default_factory=_result_container_factory,
        description="The OCR results for the page",
        repr=False,
    )

    # In-memory cache of rasterized page images keyed by raster name;
    # private so it never appears in model dumps.
    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        # Drop the raster cache when pickling/copying: the cached images can
        # be large and can be regenerated on demand.
        state = super().__getstate__()

        state["__pydantic_private__"]["_raster_cache"] = {}

        return state

    @property
    def rasterizer(self):
        """Return a rasterizer helper bound to this page's raster cache."""
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True
    ):
        """Search for ``query`` on this page via the document-level locator."""
        return self.document.locator.search(
            query,
            page_number=self.page_number,
            refine_to_word=refine_to_words,
            require_exact_match=require_exact_match,
        )

typing

rasterizer

DocumentRasterizer

Source code in docprompt/schema/pipeline/rasterizer.py
class DocumentRasterizer:
    """Renders every page of the owning document node and fans the resulting
    images out into each page node's raster cache."""

    def __init__(self, owner: "DocumentNode"):
        self.owner = owner

    def rasterize(
        self,
        name: str,
        *,
        return_mode: Literal["bytes", "pil"] = "bytes",
        dpi: int = 100,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        max_file_size_bytes: Optional[int] = None,
        render_grayscale: bool = False,
    ) -> List[Union[bytes, Image.Image]]:
        """Rasterize the whole document, caching each page image under ``name``.

        Returns the rendered page images in page order.
        """
        rendered = self.owner.document.rasterize_pdf(
            dpi=dpi,
            downscale_size=downscale_size,
            resize_mode=resize_mode,
            resize_aspect_ratios=resize_aspect_ratios,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            max_file_size_bytes=max_file_size_bytes,
            render_grayscale=render_grayscale,
            return_mode=return_mode,
        )

        # Reuse the cache-seeding logic rather than duplicating the loop here.
        self.propagate_cache(name, rendered)

        return list(rendered.values())

    def propagate_cache(self, name: str, rasters: Dict[int, Union[bytes, Image.Image]]):
        """Seed each page node's raster cache under ``name``.

        ``rasters`` keys should be one-indexed page numbers.
        """
        for page_no, image in rasters.items():
            # Page numbers are one-indexed; page_nodes is zero-indexed.
            self.owner.page_nodes[page_no - 1]._raster_cache[name] = image

propagate_cache(name, rasters)

The keys of rasters should be one-indexed page numbers.

Source code in docprompt/schema/pipeline/rasterizer.py
def propagate_cache(self, name: str, rasters: Dict[int, Union[bytes, Image.Image]]):
    """Seed each page node's raster cache under ``name``.

    ``rasters`` keys should be one-indexed page numbers.
    """
    for page_no, image in rasters.items():
        # Page numbers are one-indexed; page_nodes is zero-indexed.
        self.owner.page_nodes[page_no - 1]._raster_cache[name] = image