Skip to content




Bases: BaseModel

Represents a PDF document

Source code in docprompt/schema/
class PdfDocument(BaseModel):
    Represents a PDF document

    name: str = Field(description="The name of the document")
    file_bytes: bytes = Field(description="The bytes of the document", repr=False)
    file_path: Optional[str] = None

    password: Optional[SecretStr] = None

    def __len__(self):
        return self.num_pages

    def __hash__(self):
        return hash(self.document_hash)

    def page_count(self) -> PositiveInt:
        from docprompt.utils.util import get_page_count

        return get_page_count(self.file_bytes)

    def num_pages(self):
        return self.page_count

    def bytes_per_page(self):
        return len(self.file_bytes) / self.num_pages

    def document_hash(self) -> str:
        from docprompt.utils.util import hash_from_bytes

        return hash_from_bytes(self.file_bytes)

    def serialize_file_bytes(self, v: bytes, _info):
        compressed = gzip.compress(v)

        return base64.b64encode(compressed).decode("utf-8")

    def validate_file_bytes(cls, v: bytes):
        if not isinstance(v, bytes):
            raise ValueError("File bytes must be bytes")

        if len(v) == 0:
            raise ValueError("File bytes must not be empty")

        if filetype.guess_mime(v) == "text/plain":
            v = base64.b64decode(v, validate=True)

        if filetype.guess_mime(v) == "application/gzip":
            v = gzip.decompress(v)

        if filetype.guess_mime(v) != "application/pdf":
            raise ValueError("File bytes must be a PDF")

        return v

    def from_path(cls, file_path: Union[PathLike, str]):
        file_path = Path(file_path)

        if not file_path.is_file():
            raise ValueError(f"File path {file_path} is not a file")

        file_bytes = file_path.read_bytes()

        return cls(, file_path=str(file_path), file_bytes=file_bytes)

    def from_bytes(cls, file_bytes: bytes, name: Optional[str] = None):
        if name is None:
            name = f"PDF-{}.pdf"

        return cls(name=name, file_bytes=file_bytes)

    def get_bytes(self) -> bytes:
        return self.file_bytes  # Deprecated

    def path(self):
        return self.file_path

    def get_page_render_size(
        self, page_number: int, dpi: int = DEFAULT_DPI
    ) -> Tuple[int, int]:
        Returns the render size of a page in pixels
        return get_page_render_size_from_bytes(self.get_bytes(), page_number, dpi=dpi)

    def to_compressed_bytes(self, compression_kwargs: dict = {}) -> bytes:
        Compresses the document using Ghostscript
        with self.as_tempfile() as temp_path:
            return compress_pdf_to_bytes(temp_path, **compression_kwargs)

    def rasterize_page(
        page_number: int,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        return_mode: Literal["pil", "bytes"] = "bytes",
        Rasterizes a page of the document using Pdfium
        if page_number <= 0 or page_number > self.num_pages:
            raise ValueError(f"Page number must be between 0 and {self.num_pages}")

        post_process_fn = None

        if any(
            post_process_fn = partial(
                resize_width=downscale_size[0] if downscale_size else None,
                resize_height=downscale_size[1] if downscale_size else None,

        rastered = rasterize_page_with_pdfium(
            scale=(1 / 72) * dpi,

        return rastered

    def rasterize_page_to_data_uri(
        page_number: int,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        render_grayscale: bool = False,
    ) -> str:
        Rasterizes a page of the document using Pdfium and returns a data URI, which can
        be embedded into HTML or passed to large language models
        image_bytes = self.rasterize_page(
        return f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"

    def rasterize_pdf(
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        return_mode: Literal["pil", "bytes"] = "bytes",
        render_grayscale: bool = False,
    ) -> Dict[int, bytes]:
        Rasterizes the entire document using Pdfium
        result = {}

        post_process_fn = None

        if any(
            post_process_fn = partial(
                resize_width=downscale_size[0] if downscale_size else None,
                resize_height=downscale_size[1] if downscale_size else None,

        for idx, rastered in enumerate(
                scale=(1 / 72) * dpi,
            result[idx + 1] = rastered

        return result

    def split(self, start: Optional[int] = None, stop: Optional[int] = None):
        Splits a document into multiple documents
        if start is None and stop is None:
            raise ValueError("Must specify either start or stop")

        start = start or 0

        from docprompt.utils.splitter import split_pdf_to_bytes

        split_bytes = split_pdf_to_bytes(
            self.file_bytes, start_page=start, stop_page=stop

        return Document.from_bytes(split_bytes,

    def as_tempfile(self, **kwargs):
        Returns a tempfile of the document

        def tempfile_context() -> Generator[str, None, None]:
            tempfile_kwargs = {"mode": "wb", "delete": True, "suffix": ".pdf", **kwargs}

            with tempfile.NamedTemporaryFile(**tempfile_kwargs) as f:

        return tempfile_context()

    def write_to_path(self, path: Union[PathLike, str], **kwargs):
        Writes the document to a path
        path = Path(path)

        if path.is_dir():
            path = path /

        with"wb") as f:


Returns a tempfile of the document

Source code in docprompt/schema/
def as_tempfile(self, **kwargs):
    Returns a tempfile of the document

    def tempfile_context() -> Generator[str, None, None]:
        tempfile_kwargs = {"mode": "wb", "delete": True, "suffix": ".pdf", **kwargs}

        with tempfile.NamedTemporaryFile(**tempfile_kwargs) as f:

    return tempfile_context()

get_page_render_size(page_number, dpi=DEFAULT_DPI)

Returns the render size of a page in pixels

Source code in docprompt/schema/
def get_page_render_size(
    self, page_number: int, dpi: int = DEFAULT_DPI
) -> Tuple[int, int]:
    """Return the render size of a page in pixels.

    Args:
        page_number: The page to measure. It is forwarded unchanged to
            ``get_page_render_size_from_bytes``.
        dpi: The resolution, in dots per inch, the page would be rendered at.

    Returns:
        A ``(width, height)`` tuple in pixels.
    """
    # Read the field directly: ``get_bytes`` is only a deprecated alias that
    # returns ``self.file_bytes``.
    return get_page_render_size_from_bytes(self.file_bytes, page_number, dpi=dpi)

rasterize_page(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes')

Rasterizes a page of the document using Pdfium

Source code in docprompt/schema/
def rasterize_page(
    page_number: int,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    return_mode: Literal["pil", "bytes"] = "bytes",
    Rasterizes a page of the document using Pdfium
    if page_number <= 0 or page_number > self.num_pages:
        raise ValueError(f"Page number must be between 0 and {self.num_pages}")

    post_process_fn = None

    if any(
        post_process_fn = partial(
            resize_width=downscale_size[0] if downscale_size else None,
            resize_height=downscale_size[1] if downscale_size else None,

    rastered = rasterize_page_with_pdfium(
        scale=(1 / 72) * dpi,

    return rastered

rasterize_page_to_data_uri(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, render_grayscale=False)

Rasterizes a page of the document using Pdfium and returns a data URI, which can be embedded into HTML or passed to large language models

Source code in docprompt/schema/
def rasterize_page_to_data_uri(
    page_number: int,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    render_grayscale: bool = False,
) -> str:
    Rasterizes a page of the document using Pdfium and returns a data URI, which can
    be embedded into HTML or passed to large language models
    image_bytes = self.rasterize_page(
    return f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"

rasterize_pdf(dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes', render_grayscale=False)

Rasterizes the entire document using Pdfium

Source code in docprompt/schema/
def rasterize_pdf(
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    return_mode: Literal["pil", "bytes"] = "bytes",
    render_grayscale: bool = False,
) -> Dict[int, bytes]:
    Rasterizes the entire document using Pdfium
    result = {}

    post_process_fn = None

    if any(
        post_process_fn = partial(
            resize_width=downscale_size[0] if downscale_size else None,
            resize_height=downscale_size[1] if downscale_size else None,

    for idx, rastered in enumerate(
            scale=(1 / 72) * dpi,
        result[idx + 1] = rastered

    return result

split(start=None, stop=None)

Splits a document into multiple documents

Source code in docprompt/schema/
def split(self, start: Optional[int] = None, stop: Optional[int] = None):
    Splits a document into multiple documents
    if start is None and stop is None:
        raise ValueError("Must specify either start or stop")

    start = start or 0

    from docprompt.utils.splitter import split_pdf_to_bytes

    split_bytes = split_pdf_to_bytes(
        self.file_bytes, start_page=start, stop_page=stop

    return Document.from_bytes(split_bytes,


Compresses the document using Ghostscript

Source code in docprompt/schema/
def to_compressed_bytes(self, compression_kwargs: Optional[dict] = None) -> bytes:
    """Compress the document using Ghostscript.

    The document is written to a temporary file, whose path is handed to
    ``compress_pdf_to_bytes``.

    Args:
        compression_kwargs: Extra keyword arguments forwarded to
            ``compress_pdf_to_bytes``. ``None`` (the default) forwards none.

    Returns:
        The bytes returned by ``compress_pdf_to_bytes``.
    """
    # A ``None`` default avoids sharing one mutable dict across every call.
    extra_kwargs = compression_kwargs or {}

    with self.as_tempfile() as temp_path:
        return compress_pdf_to_bytes(temp_path, **extra_kwargs)

write_to_path(path, **kwargs)

Writes the document to a path

Source code in docprompt/schema/
def write_to_path(self, path: Union[PathLike, str], **kwargs):
    Writes the document to a path
    path = Path(path)

    if path.is_dir():
        path = path /

    with"wb") as f:

get_page_render_size_from_bytes(file_bytes, page_number, dpi=DEFAULT_DPI)

Returns the render size of a page in pixels

Source code in docprompt/schema/
def get_page_render_size_from_bytes(
    file_bytes: bytes, page_number: int, dpi: int = DEFAULT_DPI
) -> Tuple[int, int]:
    """Return the render size of a page in pixels.

    Args:
        file_bytes: The raw bytes of the PDF document.
        page_number: The page to measure, passed directly to ``pdf.get_page``.
        dpi: The resolution, in dots per inch, the page would be rendered at.

    Returns:
        A ``(width, height)`` tuple in pixels.
    """
    with get_pdfium_document(file_bytes) as pdf:
        # NOTE(review): ``page_number`` is handed to ``get_page`` as-is, while
        # ``rasterize_page`` rejects ``page_number <= 0`` — confirm both use
        # the same indexing convention.
        page = pdf.get_page(page_number)

        mediabox = page.get_mediabox()

        # Width comes from entries 2 and 0 of the media box, height from
        # entries 3 and 1; both are truncated to whole units before scaling.
        base_width = int(mediabox[2] - mediabox[0])
        base_height = int(mediabox[3] - mediabox[1])

        # Presumably 72 is the number of PDF units per inch, so ``dpi / 72``
        # converts page units to pixels — verify against the rasterizer scale.
        width = int(base_width * dpi / 72)
        height = int(base_height * dpi / 72)

        return width, height



Bases: BaseModel

Represents a normalized bounding poly with each value in the range [0, 1]

Used for higher order shapes like polygons on a page

Source code in docprompt/schema/
class BoundingPoly(BaseModel):
    """Represents a normalized bounding poly with each value in the range [0, 1].

    Used for higher order shapes like polygons on a page.
    """

    # The polygon's vertices, in the order they were supplied.
    normalized_vertices: List[Point]

    def __getitem__(self, index):
        """Return the vertex stored at ``index`` in ``normalized_vertices``."""
        return self.normalized_vertices[index]


Bases: BaseModel

Represents a normalized bounding box with each value in the range [0, 1]

Where x1 > x0 and bottom > top

Source code in docprompt/schema/
class NormBBox(BaseModel):
    Represents a normalized bounding box with each value in the range [0, 1]

    Where x1 > x0 and bottom > top

    x0: BoundedFloat
    top: BoundedFloat
    x1: BoundedFloat
    bottom: BoundedFloat

    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    def as_tuple(self):
        return (self.x0,, self.x1, self.bottom)

    def __getitem__(self, index):
        # Lots of if statements to prevent new allocations
        if index > 3:
            raise IndexError("Index out of range")

        if index == 0:
            return self.x0
        elif index == 1:
        elif index == 2:
            return self.x1
        elif index == 3:
            return self.bottom

    def __eq__(self, other):
        if not isinstance(other, NormBBox):
            return False

        return self.as_tuple() == other.as_tuple()

    def __hash__(self):
        return hash(self.as_tuple())

    def __and__(self, other):
        if not isinstance(other, NormBBox):
            raise TypeError("Can only compute intersection with NormBBox")
        # Compute the intersection of two bounding boxes
        new_x0 = max(self.x0, other.x0)
        new_top = max(,
        new_x1 = min(self.x1, other.x1)
        new_bottom = min(self.bottom, other.bottom)

        # Check if there is an actual intersection and if the resulting bounding box is valid
        if new_x0 <= new_x1 and new_top <= new_bottom:
            return NormBBox(x0=new_x0, top=new_top, x1=new_x1, bottom=new_bottom)
            # Return an empty or non-existent bounding box representation
            return None

    def __add__(self, other):
        if not isinstance(other, NormBBox):
            raise TypeError("Can only add NormBBox to NormBBox")

        return NormBBox(
            x0=min(self.x0, other.x0),
            x1=max(self.x1, other.x1),
            bottom=max(self.bottom, other.bottom),

    def __contains__(self, other):
        return (
            self.x0 <= other.x0
            and <=
            and self.x1 >= other.x1
            and self.bottom >= other.bottom

    def intersection_over_union(self, other):
        if not isinstance(other, NormBBox):
            raise TypeError("Can only compute IOU with NormBBox")

        # Compute the intersection
        intersection_bbox = self & other

        if intersection_bbox:
            intersection_area = intersection_bbox.area
            union_area = self.area + other.area - intersection_area
            return intersection_area / union_area

        return 0  # No intersection

    def x_overlap(self, other):
        Get the overlap, between 0 and 1, of the x-axis of two bounding boxes
        return max(0, min(self.x1, other.x1) - max(self.x0, other.x0))

    def y_overlap(self, other):
        Get the overlap, between 0 and 1, of the y-axis of two bounding boxes
        return max(0, min(self.bottom, other.bottom) - max(,

    def combine(cls, *bboxes: "NormBBox"):
        Combines multiple bounding boxes into a single bounding box
        if len(bboxes) == 0:
            raise ValueError("Must provide at least one bounding box")

        if len(bboxes) == 1:
            return bboxes[0]

        working_bbox = bboxes[0]
        for bbox in bboxes[1:]:
            working_bbox = working_bbox + bbox

        return working_bbox

    def from_bounding_poly(cls, bounding_poly: "BoundingPoly"):
        Returns a NormBBox from a BoundingPoly
        if len(bounding_poly.normalized_vertices) != 4:
            raise ValueError(
                "BoundingPoly must have 4 vertices for NormBBox conversion"

        ) = bounding_poly.normalized_vertices

        return cls(

    def width(self):
        return self.x1 - self.x0

    def height(self):
        return self.bottom -

    def area(self):
        return self.width * self.height

    def centroid(self):
        return (self.x0 + self.x1) / 2, ( + self.bottom) / 2

    def y_center(self):
        return ( + self.bottom) / 2

    def x_center(self):
        return (self.x0 + self.x1) / 2

combine(*bboxes) classmethod

Combines multiple bounding boxes into a single bounding box

Source code in docprompt/schema/
def combine(cls, *bboxes: "NormBBox"):
    """Combine multiple bounding boxes into a single bounding box.

    The boxes are folded left to right with the ``+`` operator.

    Args:
        *bboxes: One or more bounding boxes to merge.

    Returns:
        The merged bounding box. When a single box is given, that same
        object is returned unchanged.

    Raises:
        ValueError: If no bounding boxes are provided.
    """
    if not bboxes:
        raise ValueError("Must provide at least one bounding box")

    # Unpacking covers the single-box case: ``remaining`` is then empty and
    # the first box is returned as-is.
    merged, *remaining = bboxes
    for bbox in remaining:
        merged = merged + bbox

    return merged

from_bounding_poly(bounding_poly) classmethod

Returns a NormBBox from a BoundingPoly

Source code in docprompt/schema/
def from_bounding_poly(cls, bounding_poly: "BoundingPoly"):
    Returns a NormBBox from a BoundingPoly
    if len(bounding_poly.normalized_vertices) != 4:
        raise ValueError(
            "BoundingPoly must have 4 vertices for NormBBox conversion"

    ) = bounding_poly.normalized_vertices

    return cls(


Get the overlap, between 0 and 1, of the x-axis of two bounding boxes

Source code in docprompt/schema/
def x_overlap(self, other):
    """Get the overlap, between 0 and 1, of the x-axis of two bounding boxes.

    Args:
        other: The bounding box to compare against; its ``x0`` and ``x1``
            attributes are read.

    Returns:
        The length of the shared x-interval, or ``0`` when the boxes do not
        overlap horizontally.
    """
    shared_left = max(self.x0, other.x0)
    shared_right = min(self.x1, other.x1)

    # A negative span means the intervals are disjoint; clamp it to zero.
    return max(0, shared_right - shared_left)


Get the overlap, between 0 and 1, of the y-axis of two bounding boxes

Source code in docprompt/schema/
def y_overlap(self, other):
    """Get the overlap, between 0 and 1, of the y-axis of two bounding boxes.

    Args:
        other: The bounding box to compare against; its ``top`` and
            ``bottom`` attributes are read.

    Returns:
        The length of the shared y-interval, or ``0`` when the boxes do not
        overlap vertically.
    """
    shared_top = max(self.top, other.top)
    shared_bottom = min(self.bottom, other.bottom)

    # A negative span means the intervals are disjoint; clamp it to zero.
    return max(0, shared_bottom - shared_top)


Bases: BaseModel

Represents a normalized point with each coordinate in the range [0, 1]

Source code in docprompt/schema/
class Point(BaseModel):
    """Represents a normalized point with each coordinate in the range [0, 1]."""

    # Floats are rounded to 5 decimal places by the JSON encoder.
    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    x: BoundedFloat
    y: BoundedFloat


Bases: BaseModel

Represents a single block of text, with its bounding box. The bounding box is a tuple of (x0, top, x1, bottom) and is normalized to the page size.

Source code in docprompt/schema/
class TextBlock(BaseModel):
    Represents a single block of text, with its bounding box.
    The bounding box is a tuple of (x0, top, x1, bottom) and
    is normalized to the page size.

    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    text: str
    type: SegmentLevels
    source: TextblockSource = Field(
        default="derived", description="The source of the text block"

    # Layout information
    bounding_box: NormBBox = Field(default=None, repr=False)
    bounding_poly: Optional[BoundingPoly] = Field(default=None, repr=False)
    text_spans: Optional[List[TextSpan]] = Field(default=None, repr=False)

    metadata: Optional[TextBlockMetadata] = Field(default_factory=TextBlockMetadata)

    def __getitem__(self, index):
        return getattr(self, index)

    def __hash__(self):
        return hash((self.text, self.bounding_box.as_tuple()))

    def confidence(self):
        return self.metadata.confidence

    def direction(self):
        return self.metadata.direction



Bases: BaseModel, MutableMapping, Generic[TMetadataOwner]

The base metadata class is utilized for defining a basic yet flexible interface for metadata attached to various fields.

When used out of the box, the metadata class will adopt dictionary-like behavior. You may easily access different fields of the metadata as if it were a dictionary:

# Instantiate it with any kwargs you like
metadata = BaseMetadata(foo='bar', cow='moo')

metadata["foo"]  # "bar"
metadata["cow"]  # "moo"

# Update the value of the key
metadata["foo"] = "fighters"

# Set new key-value pairs
metadata['sheep'] = 'baa'

Otherwise, you may sub-class the metadata class in order to create a more strictly typed metadata model. This is useful when you want to enforce a specific structure for your metadata.

class CustomMetadata(BaseMetadata):
    foo: str
    cow: str

# Instantiate it with the required fields
metadata = CustomMetadata(foo='bar', cow='moo')

metadata.foo  # "bar"
metadata.cow  # "moo"

# Update the value of the key
metadata.foo = "fighters"

# Use the extra field to store dynamic metadata
metadata.extra['sheep'] = 'baa'

Additionally, the task results descriptor allows for controlled and easy access to the task results of various tasks that are run on the parent node.

Source code in docprompt/schema/pipeline/
class BaseMetadata(BaseModel, MutableMapping, Generic[TMetadataOwner]):
    The base metadata class is utilized for defining a basic yet flexible interface
    for metadata attached to various fields.

    The metadata class can be used in two ways:
        1. As a dictionary-like object, where metadata is stored in the `extra` field.
        2. As a sub-classed model, where metadata is stored in the fields of the model.

    When used out of the box, the metadata class will adobpt dictionary-like behavior. You
    may easily access different fields of the metadata as if it were a dictionary:
    # Instantiate it with any kwargs you like
    metadata = BaseMetadata(foo-'bar', cow='moo')

    metadata["foo"]  # "bar"
    metadata["cow"]  # "moo"

    # Update the value of the key
    metadata["foo"] = "fighters"

    # Set new key-value pairs
    metadata['sheep'] = 'baa'

    Otherwise, you may sub-class the metadata class in order to create a more strictly typed
    metadata model. This is useful when you want to enforce a specific structure for your metadata.

    class CustomMetadata(BaseMetadata):
        foo: str
        cow: str

    # Instantiate it with the required fields
    metadata = CustomMetadata(foo='bar', cow='moo')  # "bar"
    metadata.cow  # "moo"

    # Update the value of the key = "fighters"

    # Use the extra field to store dynamic metadata
    metadata.extra['sheep'] = 'baa'

    Additionally, the task results descriptor allows for controlled and easy access to the task results
    of various tasks that are run on the parent node.

    extra: Dict[str, Any] = Field(..., default_factory=dict, repr=False)

    _task_results: TaskResultsDescriptor = PrivateAttr(

    _owner: TMetadataOwner = PrivateAttr()

    def task_results(self) -> TaskResultsDescriptor:
        """Return the task results descriptor."""
        return self._task_results.__get__(self)

    def task_results(self, value: Any) -> None:
        """This will raise an error, as we do not want to set the task results directly.

        NOTE: This implementation is here purely to avoid the task_results property from being
        overwritten by accident.
        self._task_results.__set__(self, value)

    def owner(self) -> TMetadataOwner:
        """Return the owner of the metadata."""
        return self._owner

    def owner(self, owner: TMetadataOwner) -> None:
        """Return the owner of the metadata."""
        self._owner = owner

    def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
        """Create a new instance of the metadata class with the owner set."""
        metadata = cls(**data)
        metadata.owner = owner
        return metadata

    def validate_data_fields_from_annotations(cls, data: Any) -> Any:
        """Validate the data fields from the annotations."""

        # We want to make sure that we combine the `extra` metdata along with any
        # other specific fields that are defined in the metadata.
        extra = data.pop("extra", {})
        assert isinstance(extra, dict), "The `extra` field must be a dictionary."
        data = {**data, **extra}

        # If the model has been sub-classed, then all of our fields must be
        # validated by the pydantic model.
        if cls._is_field_typed():
            # We will get the fields out of extra and set them as potential fields to
            # validate. They will be ignored if they are not defined in the model, but it
            # allows for a more flexible way to define metadata.
            # Otherwise, what ever is in the `extra` field will be stroed in the `extra` field.
            return {**data, "extra": extra}

        # Otherwise, we are using our mock-dict implentation, so we store our
        # metadata in the `extra` field.
        return {"extra": data}

    def _is_field_typed(cls):
        Check if the metadata model is field typed.

        This is used to determine if the metadata model is a dictionary-like model,
        or a more strictly typed model.
        if set(["extra"]) != set(cls.model_fields.keys()):
            return True

        return False

    def __repr__(self):
        Provide a string representation of the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __repr__ method.
        if self._is_field_typed():
            return super().__repr__()

        # Otherwise, we are deailing with dictornary-like metadata
        return json.dumps(self.extra)

    def __getitem__(self, name):
        Provide dictionary functionlaity to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __getitem__ method.
        if not self._is_field_typed():
            return self.extra[name]

        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"

    def __setitem__(self, name, value):
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __setitem__ method.
        if not self._is_field_typed():
            self.extra[name] = value
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"

    def __delitem__(self, name):
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __delitem__ method.
        if not self._is_field_typed():
            del self.extra[name]
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"

    def __iter__(self):
        Iterate over the keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __iter__ method.
        if self._is_field_typed():
            raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

        return iter(self.extra)

    def __len__(self):
        Get the number of keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __len__ method.
        if self._is_field_typed():
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '__len__'"

        return len(self.extra)

    def __getattr__(self, name):
        Allow for getting of attributes on the metadata class.

        The attributes are retrieved through the following heirarchy:
            - If the model is sub-classed, it will be retrieved as normal.
            - Otherwise, if the attribute is private, it will be retrieved as normal.
            - Finally, if we are getting a public attribute on the base metadata class,
                we use the extra field.
            - If the key is not set in the `extra` dictionary, we resort back to just
            trying to get the field.
                - This is when we grab the `owner` or `task_result` attribuite.
        if self._is_field_typed():
            return super().__getattr__(name)

        if name.startswith("_"):
            return super().__getattr__(name)

        # Attempt to retreieve the attr from the `extra` field
            return self.extra.get(name)

        except KeyError:
            # This is for grabbing properties on the base metadata class
            return super().__getattr__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        Allow for setting of attributes on the metadata class.

        The attributes are set through the following heirarchy:
            - If the model is sub-classed, it will be set as normal.
            - Otherwise, if the attribute is private, it will be set as normal.
            - Finally, if we are setting a public attribute on the base metadata class,
                we use the extra field.
        if self._is_field_typed():
            return super().__setattr__(name, value)

        # We want to avoid setting any private attributes in the extra
        # dictionary
        if name.startswith("_"):
            return super().__setattr__(name, value)

        # If it is `owner` or `task_results`, we want
        # to avoid setting the attribute in the `extra` dictionary
        if name in ["owner", "task_results"]:
            return super().__setattr__(name, value)

        self.extra[name] = value

    def __delattr__(self, name: str) -> None:
        Ensure that we can delete attributes from the metadata class.

        The attributes are deleted through the following heirarchy:
            - If the attribute is `task_results`, we use the descriptor to delete the task results.
            - Otherwise, if it is a sub-classed model, it will be deleted as normal.
            - Finally, if we are deleting a public attribute on the base metadata class,
                we use the extra field.

        # We want to use the descriptor to delete the task results
        if name == "task_results":

        # Otherwise, we use our standard fallback tiers
        if self._is_field_typed():
            return super().__delattr__(name)

        del self.extra[name]

owner: TMetadataOwner property writable

Return the owner of the metadata.

task_results: TaskResultsDescriptor property writable

Return the task results descriptor.


Ensure that we can delete attributes from the metadata class.

The attributes are deleted through the following hierarchy:
  • If the attribute is task_results, we use the descriptor to delete the task results.
  • Otherwise, if it is a sub-classed model, it will be deleted as normal.
  • Finally, if we are deleting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/
def __delattr__(self, name: str) -> None:
    """
    Ensure that we can delete attributes from the metadata class.

    The attributes are deleted through the following hierarchy:
        - If the attribute is `task_results`, we use the descriptor to delete the task results.
        - Otherwise, if it is a sub-classed model, it will be deleted as normal.
        - Finally, if we are deleting a public attribute on the base metadata class,
            we use the extra field.

    Raises:
        KeyError: If `name` is not a key of `extra` on the base metadata class.
    """
    # We want to use the descriptor to delete the task results
    if name == "task_results":
        # NOTE(review): the statement under this branch is missing from the
        # listing; this mirrors the `__get__`/`__set__` calls made on
        # `self._task_results` by the `task_results` property -- confirm.
        self._task_results.__delete__(self)
        return

    # Otherwise, we use our standard fallback tiers
    if self._is_field_typed():
        return super().__delattr__(name)

    del self.extra[name]


Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__delitem__` method.

Source code in docprompt/schema/pipeline/
def __delitem__(self, name):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden.

    Raises:
        AttributeError: If the metadata model is field typed (sub-classed).
        KeyError: If `name` is not a key of `extra`.
    """
    # Item deletion is only meaningful for the dictionary-like base model
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    del self.extra[name]


Allow for getting of attributes on the metadata class.

The attributes are retrieved through the following hierarchy:
  • If the model is sub-classed, it will be retrieved as normal.
  • Otherwise, if the attribute is private, it will be retrieved as normal.
  • Finally, if we are getting a public attribute on the base metadata class, we use the extra field.
  • If the key is not set in the extra dictionary, we resort back to just trying to get the field.
    • This is when we grab the owner or task_results attribute.
Source code in docprompt/schema/pipeline/
def __getattr__(self, name):
    """
    Allow for getting of attributes on the metadata class.

    The attributes are retrieved through the following hierarchy:
        - If the model is sub-classed, it will be retrieved as normal.
        - Otherwise, if the attribute is private, it will be retrieved as normal.
        - Finally, if we are getting a public attribute on the base metadata class,
            we use the extra field.
        - If the key is not set in the `extra` dictionary, the intent is to fall
            back to the normal attribute lookup (see the review note in the body).
    """
    if self._is_field_typed():
        return super().__getattr__(name)

    if name.startswith("_"):
        return super().__getattr__(name)

    # Attempt to retrieve the attr from the `extra` field
    try:
        # NOTE(review): `dict.get` returns None for a missing key instead of
        # raising KeyError, so the handler below is never reached and unknown
        # public attributes resolve to None. Switching to `self.extra[name]`
        # would enable the fallback but changes what callers observe -- confirm
        # before changing.
        return self.extra.get(name)

    except KeyError:
        # This is for grabbing properties on the base metadata class
        return super().__getattr__(name)


Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__getitem__` method.

Source code in docprompt/schema/pipeline/
def __getitem__(self, name):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden.

    Raises:
        AttributeError: If the metadata model is field typed (sub-classed).
        KeyError: If `name` is not a key of `extra`.
    """
    # Item access is only meaningful for the dictionary-like base model
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    return self.extra[name]


Iterate over the keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have an `__iter__` method.

Source code in docprompt/schema/pipeline/
def __iter__(self):
    """
    Iterate over the keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden.

    Returns:
        An iterator over the keys of `extra`.

    Raises:
        AttributeError: If the metadata model is field typed (sub-classed).
    """
    if self._is_field_typed():
        raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

    return iter(self.extra)


Get the number of keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__len__` method.

Source code in docprompt/schema/pipeline/
def __len__(self):
    """
    Get the number of keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden.

    Returns:
        The number of keys stored in `extra`.

    Raises:
        AttributeError: If the metadata model is field typed (sub-classed).
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '__len__'"
        )

    return len(self.extra)


Provide a string representation of the metadata.

For the base metadata model this returns the JSON dump of the extra dictionary. If sub-classed, the representation falls back to the standard BaseModel `__repr__`.

Source code in docprompt/schema/pipeline/
def __repr__(self):
    """
    Provide a string representation of the metadata.

    For a sub-classed (field typed) model the parent class representation is
    returned. For the base metadata model the `extra` dictionary is rendered
    as JSON.
    """
    if self._is_field_typed():
        return super().__repr__()

    # Otherwise, we are dealing with dictionary-like metadata.
    # `default=repr` keeps the representation usable when `extra` holds values
    # that are not JSON serializable, instead of raising TypeError.
    return json.dumps(self.extra, default=repr)

__setattr__(name, value)

Allow for setting of attributes on the metadata class.

The attributes are set through the following hierarchy:
  • If the model is sub-classed, it will be set as normal.
  • Otherwise, if the attribute is private, it will be set as normal.
  • Finally, if we are setting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/
def __setattr__(self, name: str, value: Any) -> None:
    """
    Allow for setting of attributes on the metadata class.

    The attributes are set through the following hierarchy:
        - If the model is sub-classed, it will be set as normal.
        - Otherwise, if the attribute is private, it will be set as normal.
        - Otherwise, if the attribute is `owner` or `task_results`, it will be
            set as normal.
        - Finally, if we are setting a public attribute on the base metadata class,
            we use the extra field.
    """
    if self._is_field_typed():
        return super().__setattr__(name, value)

    # We want to avoid setting any private attributes in the extra
    # dictionary
    if name.startswith("_"):
        return super().__setattr__(name, value)

    # If it is `owner` or `task_results`, we want
    # to avoid setting the attribute in the `extra` dictionary
    if name in ("owner", "task_results"):
        return super().__setattr__(name, value)

    self.extra[name] = value

__setitem__(name, value)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__setitem__` method.

Source code in docprompt/schema/pipeline/
def __setitem__(self, name, value):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden.

    Raises:
        AttributeError: If the metadata model is field typed (sub-classed).
    """
    # Item assignment is only meaningful for the dictionary-like base model
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    self.extra[name] = value

from_owner(owner, **data) classmethod

Create a new instance of the metadata class with the owner set.

Source code in docprompt/schema/pipeline/
def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
    """Build a metadata instance from ``data`` and attach ``owner`` to it."""
    instance = cls(**data)
    instance.owner = owner
    return instance

validate_data_fields_from_annotations(data) classmethod

Validate the data fields from the annotations.

Source code in docprompt/schema/pipeline/
def validate_data_fields_from_annotations(cls, data: Any) -> Any:
    """Validate the data fields from the annotations.

    The entries of the incoming `extra` mapping are merged with the other
    top-level keys of `data` (keys from `extra` win on conflict).

    Returns:
        For a field typed (sub-classed) model: the merged keys plus the original
        `extra` mapping under `"extra"`. For the base model: a single `"extra"`
        key holding the merged mapping.
    """

    # Read instead of `pop` so that the mapping handed in by the caller is
    # not modified as a side effect of validation.
    extra = data.get("extra", {})
    assert isinstance(extra, dict), "The `extra` field must be a dictionary."

    # We want to make sure that we combine the `extra` metadata along with any
    # other specific fields that are defined in the metadata.
    fields = {key: value for key, value in data.items() if key != "extra"}
    merged = {**fields, **extra}

    # If the model has been sub-classed, the keys taken out of `extra` are
    # offered as potential model fields, while `extra` itself is kept as given.
    if cls._is_field_typed():
        return {**merged, "extra": extra}

    # Otherwise, we are using our mock-dict implementation, so we store our
    # metadata in the `extra` field.
    return {"extra": merged}


Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/
class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata
    """

    # The document nodes that make up the collection
    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    # Metadata for the collection as a whole; a fresh `dict` is built by default
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)


Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    Represents a single document, with some metadata

    document: PdfDocument
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",

    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        state = super().__getstate__()

        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        return len(self.page_nodes)

    def __getitem__(self, index):
        return self.page_nodes[index]

    def __iter__(self):
        return iter(self.page_nodes)

    def rasterizer(self):
        return DocumentRasterizer(self)

    def locator(self):
        if self._locator is None:

        return self._locator

    def refresh_locator(self):
        Refreshes the locator for this document node
        from import DocumentProvenanceLocator

        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    def from_document(
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."

        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    metadata=page_metadata[page_number - 1],
                page_node = PageNode(document=document_node, page_number=page_number)


        return document_node

    def file_hash(self):
        return self.document.document_hash

    def document_name(self):

    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, we want to retrieve
        # it as a TypedDict
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."

        # Get the overriden Generic type of th DocumentNodeMetadata
        return metadata_field_annotation

    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[

        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."

        return page_node_metadata_field_annotation

    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

            DocumentNode: The loaded document node.

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes =
            file_hash, **kwargs

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
            metadata = cls.metadata_class().from_owner(node, **{})

        if page_metadata_bytes:
            page_metadata_json = [
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

            FileSidecarsPathManager: The file path manager for the persisted document node.

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs

persistance_path property writable

The base path to storage location.

from_storage(path, file_hash, **kwargs) classmethod

Load the document node from storage.


Name Type Description Default
path str

The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"

file_hash str

The hash of the document.


Additional keyword arguments for fsspec FileSystem



Name Type Description
DocumentNode Self

The loaded document node.

Source code in docprompt/schema/pipeline/node/
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Load the document node from storage.

        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

        DocumentNode: The loaded document node.

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes, metadata_bytes, page_metadata_bytes =
        file_hash, **kwargs

    doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
    node = cls.from_document(doc)

    if metadata_bytes:
        metadata_json = json.loads(metadata_bytes.decode("utf-8"))
        metadata = cls.metadata_class().from_owner(node, **metadata_json)
        metadata = cls.metadata_class().from_owner(node, **{})

    if page_metadata_bytes:
        page_metadata_json = [
            for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
        page_metadata = [
            cls.page_metadata_class()(**page) for page in page_metadata_json
        page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

    # Store the metadata on the node and page nodes
    node.metadata = metadata
    for page, meta in zip(node.page_nodes, page_metadata):
        page.metadata = meta

    # Make sure to set the persistance path on the node
    node.persistance_path = path

    return node

metadata_class() classmethod

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/
def metadata_class(cls) -> Type[BaseMetadata]:
    """Get the metadata class for instantiating metadata from the model.

    Returns:
        `BaseMetadata` when the `metadata` field is still annotated with the
        unbound `DocumentNodeMetadata` generic, otherwise the annotation of
        the `metadata` field itself.

    Raises:
        ValueError: If the `metadata` field annotation is a `ForwardRef`.
    """

    fields = cls.model_fields

    # The annotation of the `metadata` field carries the (possibly overridden)
    # generic metadata type.
    metadata_field_annotation = fields["metadata"].annotation

    # If no override has been provided to the metadata model, we fall back to
    # the base metadata class.
    if metadata_field_annotation == DocumentNodeMetadata:
        return BaseMetadata

    if isinstance(metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
        )

    # Get the overridden Generic type of the DocumentNodeMetadata
    return metadata_field_annotation

page_metadata_class() classmethod

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Get the metadata class for the page nodes in the document.

    Returns:
        `BaseMetadata` when the page node `metadata` field is still annotated
        with the unbound `PageNodeMetadata` generic, otherwise the annotation
        of that field itself.

    Raises:
        ValueError: If the page node `metadata` field annotation is a `ForwardRef`.
    """
    fields = cls.model_fields

    # NOTE: The indexing is important here, and it allows us to get the type of each
    # page node in the `List` annotation
    page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

    # NOTE(review): the subscript was truncated in the listing; the `"metadata"`
    # key and `.annotation` access mirror `metadata_class` -- confirm.
    page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
        "metadata"
    ].annotation

    # If no override has been provided for the page metadata, fall back to the
    # base metadata class.
    if page_node_metadata_field_annotation == PageNodeMetadata:
        return BaseMetadata

    if isinstance(page_node_metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define PageNode with a ForwardRef for Generic metadata model types."
        )

    return page_node_metadata_field_annotation

persist(path=None, **kwargs)

Persist a document node to storage.


Name Type Description Default
path Optional[str]

Overwrites the current persistance_path property - If persistance_path is not currently set, path must be provided.


Additional keyword arguments for fsspec FileSystem



Name Type Description
FileSidecarsPathManager FileSidecarsPathManager

The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/
def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Persist a document node to storage.

    Args:
        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        FileSidecarsPathManager: The file path manager for the persisted document node.

    Raises:
        ValueError: If neither `path` nor `persistance_path` provides a path.
    """

    path = path or self.persistance_path

    if path is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Make sure to update the persistance path
    self.persistance_path = path

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes = self.document.get_bytes()
    metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
    # Each page's metadata is dumped to its own JSON string, and the list of
    # those strings is then JSON-encoded as a whole.
    page_metadata_bytes = bytes(
        json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
        encoding="utf-8",
    )

    return fs_manager.write(
        pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
    )


Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/
def refresh_locator(self):
    Refreshes the locator for this document node
    from import DocumentProvenanceLocator

    if any(not page.ocr_results.result for page in self.page_nodes):
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator


Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/
class PageNode(BaseNode, Generic[PageNodeMetadata]):
    Represents a single page in a document, with some metadata

    document: "DocumentNode" = Field(exclude=True, repr=False)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        description="Application-specific metadata for the page",
    extra: Dict[str, Any] = Field(
        description="Extra data that can be stored on the page node",

    ocr_results: ResultContainer[OcrPageResult] = Field(
        description="The OCR results for the page",

    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        state = super().__getstate__()

        state["__pydantic_private__"]["_raster_cache"] = {}

        return state

    def rasterizer(self):
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True


The metadata class is utilized for defining a basic, yet flexible interface for metadata attached to various fields.

In essence, this allows developers to choose to either create their metadata in an unstructured manner (i.e. a dictionary), or to sub-class the base metadata class in order to create a more strictly typed metadata model for their page and document nodes.


Bases: BaseModel, MutableMapping, Generic[TMetadataOwner]

The base metadata class is utilized for defining a basic yet flexible interface for metadata attached to various fields.

When used out of the box, the metadata class will adopt dictionary-like behavior. You may easily access different fields of the metadata as if it were a dictionary:

# Instantiate it with any kwargs you like
metadata = BaseMetadata(foo='bar', cow='moo')

metadata["foo"]  # "bar"
metadata["cow"]  # "moo"

# Update the value of the key
metadata["foo"] = "fighters"

# Set new key-value pairs
metadata['sheep'] = 'baa'

Otherwise, you may sub-class the metadata class in order to create a more strictly typed metadata model. This is useful when you want to enforce a specific structure for your metadata.

class CustomMetadata(BaseMetadata):
    foo: str
    cow: str

# Instantiate it with the required fields
metadata = CustomMetadata(foo='bar', cow='moo')
metadata.foo  # "bar"
metadata.cow  # "moo"

# Update the value of the key
metadata.foo = "fighters"

# Use the extra field to store dynamic metadata
metadata.extra['sheep'] = 'baa'

Additionally, the task results descriptor allows for controlled and easy access to the task results of various tasks that are run on the parent node.

Source code in docprompt/schema/pipeline/
class BaseMetadata(BaseModel, MutableMapping, Generic[TMetadataOwner]):
    The base metadata class is utilized for defining a basic yet flexible interface
    for metadata attached to various fields.

    The metadata class can be used in two ways:
        1. As a dictionary-like object, where metadata is stored in the `extra` field.
        2. As a sub-classed model, where metadata is stored in the fields of the model.

    When used out of the box, the metadata class will adobpt dictionary-like behavior. You
    may easily access different fields of the metadata as if it were a dictionary:
    # Instantiate it with any kwargs you like
    metadata = BaseMetadata(foo-'bar', cow='moo')

    metadata["foo"]  # "bar"
    metadata["cow"]  # "moo"

    # Update the value of the key
    metadata["foo"] = "fighters"

    # Set new key-value pairs
    metadata['sheep'] = 'baa'

    Otherwise, you may sub-class the metadata class in order to create a more strictly typed
    metadata model. This is useful when you want to enforce a specific structure for your metadata.

    class CustomMetadata(BaseMetadata):
        foo: str
        cow: str

    # Instantiate it with the required fields
    metadata = CustomMetadata(foo='bar', cow='moo')  # "bar"
    metadata.cow  # "moo"

    # Update the value of the key = "fighters"

    # Use the extra field to store dynamic metadata
    metadata.extra['sheep'] = 'baa'

    Additionally, the task results descriptor allows for controlled and easy access to the task results
    of various tasks that are run on the parent node.

    extra: Dict[str, Any] = Field(..., default_factory=dict, repr=False)

    _task_results: TaskResultsDescriptor = PrivateAttr(

    _owner: TMetadataOwner = PrivateAttr()

    def task_results(self) -> TaskResultsDescriptor:
        """Return the task results descriptor."""
        return self._task_results.__get__(self)

    def task_results(self, value: Any) -> None:
        """This will raise an error, as we do not want to set the task results directly.

        NOTE: This implementation is here purely to avoid the task_results property from being
        overwritten by accident.
        self._task_results.__set__(self, value)

    def owner(self) -> TMetadataOwner:
        """Return the owner of the metadata."""
        return self._owner

    def owner(self, owner: TMetadataOwner) -> None:
        """Return the owner of the metadata."""
        self._owner = owner

    def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
        """Create a new instance of the metadata class with the owner set."""
        metadata = cls(**data)
        metadata.owner = owner
        return metadata

    def validate_data_fields_from_annotations(cls, data: Any) -> Any:
        """Validate the data fields from the annotations."""

        # We want to make sure that we combine the `extra` metdata along with any
        # other specific fields that are defined in the metadata.
        extra = data.pop("extra", {})
        assert isinstance(extra, dict), "The `extra` field must be a dictionary."
        data = {**data, **extra}

        # If the model has been sub-classed, then all of our fields must be
        # validated by the pydantic model.
        if cls._is_field_typed():
            # We will get the fields out of extra and set them as potential fields to
            # validate. They will be ignored if they are not defined in the model, but it
            # allows for a more flexible way to define metadata.
            # Otherwise, what ever is in the `extra` field will be stroed in the `extra` field.
            return {**data, "extra": extra}

        # Otherwise, we are using our mock-dict implentation, so we store our
        # metadata in the `extra` field.
        return {"extra": data}

    def _is_field_typed(cls):
        Check if the metadata model is field typed.

        This is used to determine if the metadata model is a dictionary-like model,
        or a more strictly typed model.
        if set(["extra"]) != set(cls.model_fields.keys()):
            return True

        return False

    def __repr__(self):
        Provide a string representation of the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __repr__ method.
        if self._is_field_typed():
            return super().__repr__()

        # Otherwise, we are deailing with dictornary-like metadata
        return json.dumps(self.extra)

    def __getitem__(self, name):
        Provide dictionary functionlaity to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __getitem__ method.
        if not self._is_field_typed():
            return self.extra[name]

        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"

    def __setitem__(self, name, value):
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __setitem__ method.
        if not self._is_field_typed():
            self.extra[name] = value
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"

    def __delitem__(self, name):
        """Provide dictionary functionality to the metadata class.

        Only the dictionary-like base model supports item deletion, which
        removes ``name`` from ``extra``; any lookup error raised by ``extra``
        propagates.

        Raises:
            AttributeError: If the model is field typed.
        """
        if not self._is_field_typed():
            del self.extra[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __iter__(self):
        """Iterate over the keys in the metadata.

        Only the dictionary-like base model is iterable; iteration is
        delegated to ``extra``.

        Raises:
            AttributeError: If the model is field typed.
        """
        if self._is_field_typed():
            raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

        return iter(self.extra)

    def __len__(self):
        """Get the number of keys in the metadata.

        Only the dictionary-like base model has a length, which is the length
        of ``extra``.

        Raises:
            AttributeError: If the model is field typed.
        """
        if self._is_field_typed():
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '__len__'"
            )

        return len(self.extra)

    def __getattr__(self, name):
        """Allow for getting of attributes on the metadata class.

        The attributes are retrieved through the following hierarchy:
            - If the model is sub-classed, it will be retrieved as normal.
            - Otherwise, if the attribute is private, it will be retrieved as normal.
            - Finally, if we are getting a public attribute on the base metadata class,
                we use the `extra` field.
            - If the key is not set in the `extra` dictionary, we resort back to just
                trying to get the field.
                - This is when we grab the `owner` or `task_result` attribute.
        """
        if self._is_field_typed():
            return super().__getattr__(name)

        if name.startswith("_"):
            return super().__getattr__(name)

        # Attempt to retrieve the attr from the `extra` field
        try:
            return self.extra.get(name)

        # NOTE(review): if `extra` is a plain dict, `.get` returns None for a
        # missing key rather than raising, so this fallback would never run --
        # confirm whether `self.extra[name]` was intended.
        except KeyError:
            # This is for grabbing properties on the base metadata class
            return super().__getattr__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        """Allow for setting of attributes on the metadata class.

        The attributes are set through the following hierarchy:
            - If the model is sub-classed, it will be set as normal.
            - Otherwise, if the attribute is private, it will be set as normal.
            - Otherwise, `owner` and `task_results` are set as normal.
            - Finally, if we are setting a public attribute on the base metadata class,
                we use the `extra` field.
        """
        if self._is_field_typed():
            return super().__setattr__(name, value)

        # We want to avoid setting any private attributes in the extra
        # dictionary
        if name.startswith("_"):
            return super().__setattr__(name, value)

        # If it is `owner` or `task_results`, we want
        # to avoid setting the attribute in the `extra` dictionary
        if name in ["owner", "task_results"]:
            return super().__setattr__(name, value)

        self.extra[name] = value

    def __delattr__(self, name: str) -> None:
        """Ensure that we can delete attributes from the metadata class.

        The attributes are deleted through the following hierarchy:
            - If the attribute is `task_results`, we use the descriptor to delete the task results.
            - Otherwise, if it is a sub-classed model, it will be deleted as normal.
            - Finally, if we are deleting a public attribute on the base metadata class,
                we use the `extra` field.
        """

        # We want to use the descriptor to delete the task results
        if name == "task_results":
            # NOTE(review): the statement belonging to this branch is absent
            # from this listing -- restore it from the original source.

        # Otherwise, we use our standard fallback tiers
        if self._is_field_typed():
            return super().__delattr__(name)

        del self.extra[name]
owner: TMetadataOwner property writable

Return the owner of the metadata.

task_results: TaskResultsDescriptor property writable

Return the task results descriptor.


Ensure that we can delete attributes from the metadata class.

The attributes are deleted through the following hierarchy:
  • If the attribute is task_results, we use the descriptor to delete the task results.
  • Otherwise, if it is a sub-classed model, it will be deleted as normal.
  • Finally, if we are deleting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/
def __delattr__(self, name: str) -> None:
    """Ensure that we can delete attributes from the metadata class.

    The attributes are deleted through the following hierarchy:
        - If the attribute is `task_results`, we use the descriptor to delete the task results.
        - Otherwise, if it is a sub-classed model, it will be deleted as normal.
        - Finally, if we are deleting a public attribute on the base metadata class,
            we use the `extra` field.
    """

    # We want to use the descriptor to delete the task results
    if name == "task_results":
        # NOTE(review): the statement belonging to this branch is absent
        # from this listing -- restore it from the original source.

    # Otherwise, we use our standard fallback tiers
    if self._is_field_typed():
        return super().__delattr__(name)

    del self.extra[name]

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__delitem__` method.

Source code in docprompt/schema/pipeline/
def __delitem__(self, name):
    """Provide dictionary functionality to the metadata class.

    Only the dictionary-like base model supports item deletion, which
    removes ``name`` from ``extra``; any lookup error raised by ``extra``
    propagates.

    Raises:
        AttributeError: If the model is field typed.
    """
    if not self._is_field_typed():
        del self.extra[name]
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

Allow for getting of attributes on the metadata class.

The attributes are retrieved through the following hierarchy:
  • If the model is sub-classed, it will be retrieved as normal.
  • Otherwise, if the attribute is private, it will be retrieved as normal.
  • Finally, if we are getting a public attribute on the base metadata class, we use the extra field.
  • If the key is not set in the extra dictionary, we resort back to just trying to get the field.
    • This is when we grab the owner or task_result attribute.
Source code in docprompt/schema/pipeline/
def __getattr__(self, name):
    """Allow for getting of attributes on the metadata class.

    The attributes are retrieved through the following hierarchy:
        - If the model is sub-classed, it will be retrieved as normal.
        - Otherwise, if the attribute is private, it will be retrieved as normal.
        - Finally, if we are getting a public attribute on the base metadata class,
            we use the `extra` field.
        - If the key is not set in the `extra` dictionary, we resort back to just
            trying to get the field.
            - This is when we grab the `owner` or `task_result` attribute.
    """
    if self._is_field_typed():
        return super().__getattr__(name)

    if name.startswith("_"):
        return super().__getattr__(name)

    # Attempt to retrieve the attr from the `extra` field
    try:
        return self.extra.get(name)

    # NOTE(review): if `extra` is a plain dict, `.get` returns None for a
    # missing key rather than raising, so this fallback would never run --
    # confirm whether `self.extra[name]` was intended.
    except KeyError:
        # This is for grabbing properties on the base metadata class
        return super().__getattr__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__getitem__` method.

Source code in docprompt/schema/pipeline/
def __getitem__(self, name):
    """Provide dictionary functionality to the metadata class.

    Only the dictionary-like base model supports item access, which reads
    ``name`` from ``extra``; any lookup error raised by ``extra`` propagates.

    Raises:
        AttributeError: If the model is field typed.
    """
    if not self._is_field_typed():
        return self.extra[name]

    raise AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '{name}'"
    )

Iterate over the keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have an `__iter__` method.

Source code in docprompt/schema/pipeline/
def __iter__(self):
    """Iterate over the keys in the metadata.

    Only the dictionary-like base model is iterable; iteration is
    delegated to ``extra``.

    Raises:
        AttributeError: If the model is field typed.
    """
    if self._is_field_typed():
        raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

    return iter(self.extra)

Get the number of keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__len__` method.

Source code in docprompt/schema/pipeline/
def __len__(self):
    """Get the number of keys in the metadata.

    Only the dictionary-like base model has a length, which is the length
    of ``extra``.

    Raises:
        AttributeError: If the model is field typed.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '__len__'"
        )

    return len(self.extra)

Provide a string representation of the metadata.

The JSON form only applies to the base metadata model. If sub-classed (field typed), the parent class's `__repr__` is used instead.

Source code in docprompt/schema/pipeline/
def __repr__(self):
    """Provide a string representation of the metadata.

    Field-typed (sub-classed) models defer to the parent class's
    ``__repr__``; the dictionary-like base model is rendered as the JSON
    dump of its ``extra`` field.
    """
    if self._is_field_typed():
        return super().__repr__()

    # Otherwise, we are dealing with dictionary-like metadata
    return json.dumps(self.extra)
__setattr__(name, value)

Allow for setting of attributes on the metadata class.

The attributes are set through the following hierarchy:
  • If the model is sub-classed, it will be set as normal.
  • Otherwise, if the attribute is private, it will be set as normal.
  • Finally, if we are setting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/
def __setattr__(self, name: str, value: Any) -> None:
    """Allow for setting of attributes on the metadata class.

    The attributes are set through the following hierarchy:
        - If the model is sub-classed, it will be set as normal.
        - Otherwise, if the attribute is private, it will be set as normal.
        - Otherwise, `owner` and `task_results` are set as normal.
        - Finally, if we are setting a public attribute on the base metadata class,
            we use the `extra` field.
    """
    if self._is_field_typed():
        return super().__setattr__(name, value)

    # We want to avoid setting any private attributes in the extra
    # dictionary
    if name.startswith("_"):
        return super().__setattr__(name, value)

    # If it is `owner` or `task_results`, we want
    # to avoid setting the attribute in the `extra` dictionary
    if name in ["owner", "task_results"]:
        return super().__setattr__(name, value)

    self.extra[name] = value
__setitem__(name, value)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__setitem__` method.

Source code in docprompt/schema/pipeline/
def __setitem__(self, name, value):
    """Provide dictionary functionality to the metadata class.

    Only the dictionary-like base model supports item assignment, which
    stores ``value`` under ``name`` in ``extra``.

    Raises:
        AttributeError: If the model is field typed.
    """
    if not self._is_field_typed():
        self.extra[name] = value
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )
from_owner(owner, **data) classmethod

Create a new instance of the metadata class with the owner set.

Source code in docprompt/schema/pipeline/
def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
    """Build a metadata instance from ``data`` and attach ``owner`` to it.

    Args:
        owner: The object assigned to the new instance's ``owner`` attribute.
        **data: Keyword arguments forwarded unchanged to the class constructor.

    Returns:
        The newly constructed instance, with ``owner`` already set.
    """
    instance = cls(**data)
    setattr(instance, "owner", owner)
    return instance
validate_data_fields_from_annotations(data) classmethod

Validate the data fields from the annotations.

Source code in docprompt/schema/pipeline/
def validate_data_fields_from_annotations(cls, data: Any) -> Any:
    """Validate the data fields from the annotations.

    Merges the entries of the incoming ``extra`` mapping with the other
    top-level keys, then shapes the result according to the model kind.

    Args:
        data: The raw input. Anything that is not a ``dict`` is returned
            unchanged.

    Returns:
        For field-typed models, the merged keys plus the original ``extra``
        mapping under ``"extra"``; otherwise ``{"extra": merged}``.

    Raises:
        AssertionError: If the ``extra`` entry is not a dictionary.
    """
    # Only plain dictionaries can be merged; hand anything else back as-is.
    if not isinstance(data, dict):
        return data

    # Work on a shallow copy so the caller's mapping is never mutated.
    fields = dict(data)

    # We want to make sure that we combine the `extra` metadata along with any
    # other specific fields that are defined in the metadata.
    extra = fields.pop("extra", {})
    if not isinstance(extra, dict):
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        # while keeping the exception type unchanged.
        raise AssertionError("The `extra` field must be a dictionary.")
    merged = {**fields, **extra}

    # If the model has been sub-classed, then all of our fields must be
    # validated by the pydantic model.
    if cls._is_field_typed():
        # The `extra` entries are also exposed as top-level keys so they can
        # be picked up as potential fields, while the original `extra` mapping
        # is still passed along under `extra`.
        return {**merged, "extra": extra}

    # Otherwise, we are using our mock-dict implementation, so we store our
    # metadata in the `extra` field.
    return {"extra": merged}



Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/
class DocumentCollection(
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
    Represents a collection of documents with some common metadata

    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)


Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    Represents a single document, with some metadata

    document: PdfDocument
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",

    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        state = super().__getstate__()

        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        return len(self.page_nodes)

    def __getitem__(self, index):
        return self.page_nodes[index]

    def __iter__(self):
        return iter(self.page_nodes)

    def rasterizer(self):
        return DocumentRasterizer(self)

    def locator(self):
        if self._locator is None:

        return self._locator

    def refresh_locator(self):
        Refreshes the locator for this document node
        from import DocumentProvenanceLocator

        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    def from_document(
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."

        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    metadata=page_metadata[page_number - 1],
                page_node = PageNode(document=document_node, page_number=page_number)


        return document_node

    def file_hash(self):
        return self.document.document_hash

    def document_name(self):

    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, we want to retrieve
        # it as a TypedDict
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."

        # Get the overriden Generic type of th DocumentNodeMetadata
        return metadata_field_annotation

    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[

        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."

        return page_node_metadata_field_annotation

    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

            DocumentNode: The loaded document node.

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes =
            file_hash, **kwargs

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
            metadata = cls.metadata_class().from_owner(node, **{})

        if page_metadata_bytes:
            page_metadata_json = [
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

            FileSidecarsPathManager: The file path manager for the persisted document node.

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
persistance_path property writable

The base path to storage location.

from_storage(path, file_hash, **kwargs) classmethod

Load the document node from storage.


Name Type Description Default
path str

The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"

file_hash str

The hash of the document.


Additional keyword arguments for fsspec FileSystem



Name Type Description
DocumentNode Self

The loaded document node.

Source code in docprompt/schema/pipeline/node/
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Load the document node from storage.

        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

        DocumentNode: The loaded document node.

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes, metadata_bytes, page_metadata_bytes =
        file_hash, **kwargs

    doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
    node = cls.from_document(doc)

    if metadata_bytes:
        metadata_json = json.loads(metadata_bytes.decode("utf-8"))
        metadata = cls.metadata_class().from_owner(node, **metadata_json)
        metadata = cls.metadata_class().from_owner(node, **{})

    if page_metadata_bytes:
        page_metadata_json = [
            for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
        page_metadata = [
            cls.page_metadata_class()(**page) for page in page_metadata_json
        page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

    # Store the metadata on the node and page nodes
    node.metadata = metadata
    for page, meta in zip(node.page_nodes, page_metadata):
        page.metadata = meta

    # Make sure to set the persistance path on the node
    node.persistance_path = path

    return node
metadata_class() classmethod

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/
def metadata_class(cls) -> Type[BaseMetadata]:
    """Get the metadata class for instantiating metadata from the model."""

    fields = cls.model_fields

    # NOTE: The indexing is important here, and relies on the generic type being
    # the SECOND of the two arguments in the `Union` annotation
    metadata_field_annotation = fields["metadata"].annotation

    # If no override has been provided to the metadata model, we want to retrieve
    # it as a TypedDict
    if metadata_field_annotation == DocumentNodeMetadata:
        return BaseMetadata

    if isinstance(metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."

    # Get the overriden Generic type of th DocumentNodeMetadata
    return metadata_field_annotation
page_metadata_class() classmethod

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Get the metadata class for the page nodes in the document."""
    fields = cls.model_fields

    # NOTE: The indexing is important here, and it allows us to get the type of each
    # page node in the `List` annotation
    page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

    # NOTE: The indexing is important here, and relies on the generic type being
    # the SECOND of the two arguments in the `Union` annotation
    page_node_metadata_field_annotation = page_nodes_field_class.model_fields[

    if page_node_metadata_field_annotation == PageNodeMetadata:
        return BaseMetadata

    if isinstance(page_node_metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define PageNode with a ForwardRef for Generic metadata model types."

    return page_node_metadata_field_annotation
persist(path=None, **kwargs)

Persist a document node to storage.


Name Type Description Default
path Optional[str]

Overwrites the current persistance_path property - If persistance_path is not currently set, path must be provided.


Additional keyword arguments for fsspec FileSystem



Name Type Description
FileSidecarsPathManager FileSidecarsPathManager

The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/
def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Persist a document node to storage.

        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

        FileSidecarsPathManager: The file path manager for the persisted document node.

    path = path or self.persistance_path

    if path is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Make sure to update the persistance path
    self.persistance_path = path

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes = self.document.get_bytes()
    metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
    page_metadata_bytes = bytes(
        json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),

    return fs_manager.write(
        pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/
def refresh_locator(self):
    Refreshes the locator for this document node
    from import DocumentProvenanceLocator

    if any(not page.ocr_results.result for page in self.page_nodes):
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator


Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/
class PageNode(BaseNode, Generic[PageNodeMetadata]):
    Represents a single page in a document, with some metadata

    document: "DocumentNode" = Field(exclude=True, repr=False)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        description="Application-specific metadata for the page",
    extra: Dict[str, Any] = Field(
        description="Extra data that can be stored on the page node",

    ocr_results: ResultContainer[OcrPageResult] = Field(
        description="The OCR results for the page",

    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        state = super().__getstate__()

        state["__pydantic_private__"]["_raster_cache"] = {}

        return state

    def rasterizer(self):
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True



Bases: BaseModel

The base node class is utilized for defining a basic yet flexible interface

Source code in docprompt/schema/pipeline/node/
class BaseNode(BaseModel):
    """The base node class is utilized for defining a basic yet flexible interface"""



Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/
class DocumentCollection(
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
    Represents a collection of documents with some common metadata

    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)



Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    # NOTE(review): this listing is incomplete. Decorators, several closing
    # brackets, docstring delimiters, what appear to be `else:` branches and a
    # few call targets are not present here; consult the module source before
    # relying on it.
    Represents a single document, with some metadata

    document: PdfDocument
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",

    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        state = super().__getstate__()

        # Reset the locator to None in the exported state.
        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        return len(self.page_nodes)

    def __getitem__(self, index):
        return self.page_nodes[index]

    def __iter__(self):
        return iter(self.page_nodes)

    def rasterizer(self):
        return DocumentRasterizer(self)

    def locator(self):
        if self._locator is None:

        return self._locator

    def refresh_locator(self):
        Refreshes the locator for this document node
        from import DocumentProvenanceLocator

        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    def from_document(
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."

        # Page numbers are one-indexed; page_metadata is indexed from zero.
        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    metadata=page_metadata[page_number - 1],
                page_node = PageNode(document=document_node, page_number=page_number)


        return document_node

    def file_hash(self):
        return self.document.document_hash

    def document_name(self):

    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        # NOTE(review): no `Union` indexing is visible on the line below; the
        # note above may be stale.
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, we want to retrieve
        # it as a TypedDict
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."

        # Get the overridden Generic type of the DocumentNodeMetadata
        return metadata_field_annotation

    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[

        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."

        return page_node_metadata_field_annotation

    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

            DocumentNode: The loaded document node.

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes =
            file_hash, **kwargs

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
            metadata = cls.metadata_class().from_owner(node, **{})

        if page_metadata_bytes:
            page_metadata_json = [
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

            FileSidecarsPathManager: The file path manager for the persisted document node.

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
persistance_path property writable

The base path to storage location.

from_storage(path, file_hash, **kwargs) classmethod

Load the document node from storage.


Name Type Description Default
path str

The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"

file_hash str

The hash of the document.


**kwargs

Additional keyword arguments for fsspec FileSystem



Name Type Description
DocumentNode Self

The loaded document node.

Source code in docprompt/schema/pipeline/node/
@classmethod
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Load the document node from storage.

    Args:
        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        DocumentNode: The loaded document node.
    """

    fs_manager = FileSystemManager(path, **kwargs)

    # NOTE(review): the call target was missing from this listing; `read` is
    # assumed as the counterpart of the `fs_manager.write(...)` call made by
    # `persist` -- confirm against FileSystemManager.
    pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
        file_hash, **kwargs
    )

    doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
    node = cls.from_document(doc)

    # Build the document metadata from the stored JSON, or from no arguments
    # when nothing was stored.
    if metadata_bytes:
        metadata_json = json.loads(metadata_bytes.decode("utf-8"))
        metadata = cls.metadata_class().from_owner(node, **metadata_json)
    else:
        metadata = cls.metadata_class().from_owner(node, **{})

    if page_metadata_bytes:
        # `persist` writes a JSON list whose items are themselves JSON strings
        # (one `model_dump_json()` per page), so each item is decoded again.
        page_metadata_json = [
            json.loads(page_str)
            for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
        ]
        page_metadata = [
            cls.page_metadata_class()(**page) for page in page_metadata_json
        ]
    else:
        page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

    # Store the metadata on the node and page nodes
    node.metadata = metadata
    for page, meta in zip(node.page_nodes, page_metadata):
        page.metadata = meta

    # Make sure to set the persistance path on the node
    node.persistance_path = path

    return node
metadata_class() classmethod

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/
@classmethod
def metadata_class(cls) -> Type[BaseMetadata]:
    """Get the metadata class for instantiating metadata from the model.

    Returns:
        ``BaseMetadata`` when the ``metadata`` field is still annotated with
        the generic ``DocumentNodeMetadata`` parameter; otherwise the
        annotation declared on the ``metadata`` field.

    Raises:
        ValueError: If the ``metadata`` field annotation is a ``ForwardRef``.
    """

    fields = cls.model_fields

    metadata_field_annotation = fields["metadata"].annotation

    # If no override has been provided for the generic metadata parameter,
    # fall back to the base metadata class.
    if metadata_field_annotation == DocumentNodeMetadata:
        return BaseMetadata

    if isinstance(metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
        )

    # Get the overridden Generic type of the DocumentNodeMetadata
    return metadata_field_annotation
page_metadata_class() classmethod

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/
@classmethod
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Get the metadata class for the page nodes in the document.

    Returns:
        ``BaseMetadata`` when the page node's ``metadata`` field is still
        annotated with the generic ``PageNodeMetadata`` parameter; otherwise
        the annotation declared on that field.

    Raises:
        ValueError: If the page node's ``metadata`` annotation is a
            ``ForwardRef``.
    """
    fields = cls.model_fields

    # NOTE: The indexing is important here, and it allows us to get the type of each
    # page node in the `List` annotation
    page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

    # Look up the annotation of the page node's own `metadata` field, mirroring
    # the `fields["metadata"].annotation` lookup in `metadata_class`.
    page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
        "metadata"
    ].annotation

    # No override was provided for the generic page metadata parameter.
    if page_node_metadata_field_annotation == PageNodeMetadata:
        return BaseMetadata

    if isinstance(page_node_metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define PageNode with a ForwardRef for Generic metadata model types."
        )

    return page_node_metadata_field_annotation
persist(path=None, **kwargs)

Persist a document node to storage.


Name Type Description Default
path Optional[str]

Overwrites the current persistance_path property - If persistance_path is not currently set, path must be provided.


**kwargs

Additional keyword arguments for fsspec FileSystem



Name Type Description
FileSidecarsPathManager FileSidecarsPathManager

The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/
def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Persist a document node to storage.

    Args:
        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        FileSidecarsPathManager: The file path manager for the persisted document node.

    Raises:
        ValueError: If no path is given and `persistance_path` is not set.
    """

    path = path or self.persistance_path

    if path is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Make sure to update the persistance path
    self.persistance_path = path

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes = self.document.get_bytes()
    metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
    # Page metadata is stored as a JSON list of per-page JSON strings, encoded
    # the same way as the document metadata above.
    page_metadata_bytes = bytes(
        json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
        encoding="utf-8",
    )

    return fs_manager.write(
        pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
    )

refresh_locator()

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/
def refresh_locator(self):
    """
    Refreshes the locator for this document node

    Returns:
        The value of ``self.locator`` after the new locator has been stored.

    Raises:
        ValueError: If any page node has a falsy ``ocr_results.result``.
    """
    # NOTE(review): the module path was missing from this listing;
    # `docprompt.provenance.search` is assumed -- confirm against the package.
    # The import is local to the method (the reason is not visible here).
    from docprompt.provenance.search import DocumentProvenanceLocator

    if any(not page.ocr_results.result for page in self.page_nodes):
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"
        )

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator



Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/
class PageNode(BaseNode, Generic[PageNodeMetadata]):
    # NOTE(review): this listing is incomplete. The closing brackets (and any
    # default arguments) of the Field(...) calls, the docstring delimiters and
    # the body of `search` are not present here; consult the module source
    # before relying on it.
    Represents a single page in a document, with some metadata

    # Back-reference to the owning document node; excluded from dumps and repr.
    document: "DocumentNode" = Field(exclude=True, repr=False)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        description="Application-specific metadata for the page",
    extra: Dict[str, Any] = Field(
        description="Extra data that can be stored on the page node",

    ocr_results: ResultContainer[OcrPageResult] = Field(
        description="The OCR results for the page",

    # Per-page raster cache keyed by name; starts out as an empty dict.
    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        state = super().__getstate__()

        # Replace the raster cache with an empty dict in the exported state.
        state["__pydantic_private__"]["_raster_cache"] = {}

        return state

    def rasterizer(self):
        # The rasterizer receives this page's cache dict together with the page.
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True




Source code in docprompt/schema/pipeline/
class DocumentRasterizer:
    # NOTE(review): this listing is incomplete. The `rasterize` signature shows
    # no `self` parameter although its body uses `self`, the arguments of the
    # `rasterize_pdf(...)` call are not present, and the docstring delimiters of
    # `propagate_cache` are missing; consult the module source.
    def __init__(self, owner: "DocumentNode"):
        # The document node whose pages receive the rasters.
        self.owner = owner

    def rasterize(
        name: str,
        return_mode: Literal["bytes", "pil"] = "bytes",
        dpi: int = 100,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        max_file_size_bytes: Optional[int] = None,
        render_grayscale: bool = False,
    ) -> List[Union[bytes, Image.Image]]:
        images = self.owner.document.rasterize_pdf(

        # `images` is iterated as a mapping of one-indexed page number to image;
        # each image is stored in the matching page node's cache under `name`.
        for page_number, image in images.items():
            page_node = self.owner.page_nodes[page_number - 1]

            page_node._raster_cache[name] = image

        return list(images.values())

    def propagate_cache(self, name: str, rasters: Dict[int, Union[bytes, Image.Image]]):
        Should be one-indexed
        for page_number, raster in rasters.items():
            # Convert the one-indexed page number to a list index.
            page_node = self.owner.page_nodes[page_number - 1]

            page_node._raster_cache[name] = raster
propagate_cache(name, rasters)

Should be one-indexed

Source code in docprompt/schema/pipeline/
def propagate_cache(self, name: str, rasters: Dict[int, Union[bytes, Image.Image]]):
    """
    Store already-rendered rasters in the raster cache of the matching pages.

    Args:
        name: Cache key under which each raster is stored on its page node.
        rasters: Mapping of page number to raster. Should be one-indexed

    Raises:
        IndexError: If a page number is below 1 or beyond the number of page
            nodes. Nothing is written to any cache in that case.
    """
    page_nodes = self.owner.page_nodes
    page_count = len(page_nodes)

    # Validate first: a page number of 0 or less would otherwise wrap around
    # through negative indexing and silently land on a page counted from the
    # end of the list.
    for page_number in rasters:
        if not 1 <= page_number <= page_count:
            raise IndexError(
                f"Page number {page_number} is out of range; expected a value "
                f"between 1 and {page_count}"
            )

    for page_number, raster in rasters.items():
        page_nodes[page_number - 1]._raster_cache[name] = raster