Index

document

PdfDocument

Bases: BaseModel

Represents a PDF document

Source code in docprompt/schema/document.py
class PdfDocument(BaseModel):
    """
    Represents a PDF document
    """

    name: str = Field(description="The name of the document")
    file_bytes: bytes = Field(description="The bytes of the document", repr=False)
    file_path: Optional[str] = None

    password: Optional[SecretStr] = None

    def __len__(self):
        return self.num_pages

    def __hash__(self):
        return hash(self.document_hash)

    @computed_field
    @cached_property
    def page_count(self) -> PositiveInt:
        from docprompt.utils.util import get_page_count

        return get_page_count(self.file_bytes)

    @property
    def num_pages(self):
        return self.page_count

    @property
    def bytes_per_page(self):
        return len(self.file_bytes) / self.num_pages

    @computed_field
    @cached_property
    def document_hash(self) -> str:
        from docprompt.utils.util import hash_from_bytes

        return hash_from_bytes(self.file_bytes)

    @field_serializer("file_bytes")
    def serialize_file_bytes(self, v: bytes, _info):
        compressed = gzip.compress(v)

        return base64.b64encode(compressed).decode("utf-8")

    @field_validator("file_bytes")
    def validate_file_bytes(cls, v: bytes):
        if not isinstance(v, bytes):
            raise ValueError("File bytes must be bytes")

        if len(v) == 0:
            raise ValueError("File bytes must not be empty")

        if filetype.guess_mime(v) == "text/plain":
            v = base64.b64decode(v, validate=True)

        if filetype.guess_mime(v) == "application/gzip":
            v = gzip.decompress(v)

        if filetype.guess_mime(v) != "application/pdf":
            raise ValueError("File bytes must be a PDF")

        return v

    @classmethod
    def from_path(cls, file_path: Union[PathLike, str]):
        file_path = Path(file_path)

        if not file_path.is_file():
            raise ValueError(f"File path {file_path} is not a file")

        file_bytes = file_path.read_bytes()

        return cls(name=file_path.name, file_path=str(file_path), file_bytes=file_bytes)

    @classmethod
    def from_bytes(cls, file_bytes: bytes, name: Optional[str] = None):
        if name is None:
            name = f"PDF-{datetime.now().isoformat()}.pdf"

        return cls(name=name, file_bytes=file_bytes)

    def get_bytes(self) -> bytes:
        return self.file_bytes  # Deprecated

    @property
    def path(self):
        return self.file_path

    def get_page_render_size(
        self, page_number: int, dpi: int = DEFAULT_DPI
    ) -> Tuple[int, int]:
        """
        Returns the render size of a page in pixels
        """
        return get_page_render_size_from_bytes(self.get_bytes(), page_number, dpi=dpi)

    def to_compressed_bytes(self, compression_kwargs: dict = {}) -> bytes:
        """
        Compresses the document using Ghostscript
        """
        with self.as_tempfile() as temp_path:
            return compress_pdf_to_bytes(temp_path, **compression_kwargs)

    def rasterize_page(
        self,
        page_number: int,
        *,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        return_mode: Literal["pil", "bytes"] = "bytes",
    ):
        """
        Rasterizes a page of the document using Pdfium
        """
        if page_number <= 0 or page_number > self.num_pages:
            raise ValueError(f"Page number must be between 0 and {self.num_pages}")

        post_process_fn = None

        if any(
            (
                downscale_size,
                max_file_size_bytes,
                resize_aspect_ratios,
                do_convert,
                do_quantize,
            )
        ):
            post_process_fn = partial(
                process_raster_image,
                resize_width=downscale_size[0] if downscale_size else None,
                resize_height=downscale_size[1] if downscale_size else None,
                resize_mode=resize_mode,
                resize_aspect_ratios=resize_aspect_ratios,
                do_convert=do_convert,
                image_convert_mode=image_convert_mode,
                do_quantize=do_quantize,
                quantize_color_count=quantize_color_count,
                max_file_size_bytes=max_file_size_bytes,
            )

        rastered = rasterize_page_with_pdfium(
            self.file_bytes,
            page_number,
            return_mode=return_mode,
            post_process_fn=post_process_fn,
            scale=(1 / 72) * dpi,
        )

        return rastered

    def rasterize_page_to_data_uri(
        self,
        page_number: int,
        *,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        render_grayscale: bool = False,
    ) -> str:
        """
        Rasterizes a page of the document using Pdfium and returns a data URI, which can
        be embedded into HTML or passed to large language models
        """
        image_bytes = self.rasterize_page(
            page_number,
            dpi=dpi,
            downscale_size=downscale_size,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            resize_mode=resize_mode,
            max_file_size_bytes=max_file_size_bytes,
            resize_aspect_ratios=resize_aspect_ratios,
            return_mode="bytes",
        )
        return f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"

    def rasterize_pdf(
        self,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        return_mode: Literal["pil", "bytes"] = "bytes",
        render_grayscale: bool = False,
    ) -> Dict[int, bytes]:
        """
        Rasterizes the entire document using Pdfium
        """
        result = {}

        post_process_fn = None

        if any(
            (
                downscale_size,
                max_file_size_bytes,
                resize_aspect_ratios,
                do_convert,
                do_quantize,
            )
        ):
            post_process_fn = partial(
                process_raster_image,
                resize_width=downscale_size[0] if downscale_size else None,
                resize_height=downscale_size[1] if downscale_size else None,
                resize_mode=resize_mode,
                resize_aspect_ratios=resize_aspect_ratios,
                do_convert=do_convert,
                image_convert_mode=image_convert_mode,
                do_quantize=do_quantize,
                quantize_color_count=quantize_color_count,
                max_file_size_bytes=max_file_size_bytes,
            )

        for idx, rastered in enumerate(
            rasterize_pdf_with_pdfium(
                self.file_bytes,
                scale=(1 / 72) * dpi,
                grayscale=render_grayscale,
                return_mode=return_mode,
                post_process_fn=post_process_fn,
            )
        ):
            result[idx + 1] = rastered

        return result

    def split(self, start: Optional[int] = None, stop: Optional[int] = None):
        """
        Splits a document into multiple documents
        """
        if start is None and stop is None:
            raise ValueError("Must specify either start or stop")

        start = start or 0

        from docprompt.utils.splitter import split_pdf_to_bytes

        split_bytes = split_pdf_to_bytes(
            self.file_bytes, start_page=start, stop_page=stop
        )

        return Document.from_bytes(split_bytes, name=self.name)

    def as_tempfile(self, **kwargs):
        """
        Returns a tempfile of the document
        """

        @contextmanager
        def tempfile_context() -> Generator[str, None, None]:
            tempfile_kwargs = {"mode": "wb", "delete": True, "suffix": ".pdf", **kwargs}

            with tempfile.NamedTemporaryFile(**tempfile_kwargs) as f:
                f.write(self.file_bytes)
                f.flush()
                yield f.name

        return tempfile_context()

    def write_to_path(self, path: Union[PathLike, str], **kwargs):
        """
        Writes the document to a path
        """
        path = Path(path)

        if path.is_dir():
            path = path / self.name

        with path.open("wb") as f:
            f.write(self.file_bytes)
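
A minimal usage sketch (the file name `sample.pdf` is illustrative; any local PDF works):

```python
from docprompt.schema.document import PdfDocument

document = PdfDocument.from_path("sample.pdf")

print(document.page_count)     # computed lazily via pdfium, then cached
print(document.document_hash)  # content hash of file_bytes, also cached
print(len(document))           # same as page_count, via __len__
```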

as_tempfile(**kwargs)

Returns a tempfile of the document

Source code in docprompt/schema/document.py
def as_tempfile(self, **kwargs):
    """
    Returns a tempfile of the document
    """

    @contextmanager
    def tempfile_context() -> Generator[str, None, None]:
        tempfile_kwargs = {"mode": "wb", "delete": True, "suffix": ".pdf", **kwargs}

        with tempfile.NamedTemporaryFile(**tempfile_kwargs) as f:
            f.write(self.file_bytes)
            f.flush()
            yield f.name

    return tempfile_context()
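
A usage sketch: the temporary `.pdf` file exists only inside the `with` block, since `delete=True` is the default:

```python
document = PdfDocument.from_path("sample.pdf")  # sample.pdf is illustrative

with document.as_tempfile() as temp_path:
    print(temp_path)  # filesystem path holding the document's bytes
# the file is removed once the context exits
```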

get_page_render_size(page_number, dpi=DEFAULT_DPI)

Returns the render size of a page in pixels

Source code in docprompt/schema/document.py
def get_page_render_size(
    self, page_number: int, dpi: int = DEFAULT_DPI
) -> Tuple[int, int]:
    """
    Returns the render size of a page in pixels
    """
    return get_page_render_size_from_bytes(self.get_bytes(), page_number, dpi=dpi)
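
A sketch; note that `page_number` is handed straight through to the pdfium document's `get_page`, which (in pypdfium2) indexes pages from 0:

```python
width, height = document.get_page_render_size(0, dpi=150)
print(width, height)  # pixel size of the first page at 150 DPI
```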

rasterize_page(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes')

Rasterizes a page of the document using Pdfium

Source code in docprompt/schema/document.py
def rasterize_page(
    self,
    page_number: int,
    *,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    return_mode: Literal["pil", "bytes"] = "bytes",
):
    """
    Rasterizes a page of the document using Pdfium
    """
    if page_number <= 0 or page_number > self.num_pages:
        raise ValueError(f"Page number must be between 0 and {self.num_pages}")

    post_process_fn = None

    if any(
        (
            downscale_size,
            max_file_size_bytes,
            resize_aspect_ratios,
            do_convert,
            do_quantize,
        )
    ):
        post_process_fn = partial(
            process_raster_image,
            resize_width=downscale_size[0] if downscale_size else None,
            resize_height=downscale_size[1] if downscale_size else None,
            resize_mode=resize_mode,
            resize_aspect_ratios=resize_aspect_ratios,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            max_file_size_bytes=max_file_size_bytes,
        )

    rastered = rasterize_page_with_pdfium(
        self.file_bytes,
        page_number,
        return_mode=return_mode,
        post_process_fn=post_process_fn,
        scale=(1 / 72) * dpi,
    )

    return rastered
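
A sketch of a common call, using only keyword arguments from the signature above:

```python
image_bytes = document.rasterize_page(
    1,                            # 1-indexed; 0 or > num_pages raises ValueError
    dpi=150,
    downscale_size=(1024, 1024),  # triggers the post-processing path
    return_mode="bytes",
)
```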

rasterize_page_to_data_uri(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, render_grayscale=False)

Rasterizes a page of the document using Pdfium and returns a data URI, which can be embedded into HTML or passed to large language models

Source code in docprompt/schema/document.py
def rasterize_page_to_data_uri(
    self,
    page_number: int,
    *,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    render_grayscale: bool = False,
) -> str:
    """
    Rasterizes a page of the document using Pdfium and returns a data URI, which can
    be embedded into HTML or passed to large language models
    """
    image_bytes = self.rasterize_page(
        page_number,
        dpi=dpi,
        downscale_size=downscale_size,
        do_convert=do_convert,
        image_convert_mode=image_convert_mode,
        do_quantize=do_quantize,
        quantize_color_count=quantize_color_count,
        resize_mode=resize_mode,
        max_file_size_bytes=max_file_size_bytes,
        resize_aspect_ratios=resize_aspect_ratios,
        return_mode="bytes",
    )
    return f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"

rasterize_pdf(dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes', render_grayscale=False)

Rasterizes the entire document using Pdfium

Source code in docprompt/schema/document.py
def rasterize_pdf(
    self,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    return_mode: Literal["pil", "bytes"] = "bytes",
    render_grayscale: bool = False,
) -> Dict[int, bytes]:
    """
    Rasterizes the entire document using Pdfium
    """
    result = {}

    post_process_fn = None

    if any(
        (
            downscale_size,
            max_file_size_bytes,
            resize_aspect_ratios,
            do_convert,
            do_quantize,
        )
    ):
        post_process_fn = partial(
            process_raster_image,
            resize_width=downscale_size[0] if downscale_size else None,
            resize_height=downscale_size[1] if downscale_size else None,
            resize_mode=resize_mode,
            resize_aspect_ratios=resize_aspect_ratios,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            max_file_size_bytes=max_file_size_bytes,
        )

    for idx, rastered in enumerate(
        rasterize_pdf_with_pdfium(
            self.file_bytes,
            scale=(1 / 72) * dpi,
            grayscale=render_grayscale,
            return_mode=return_mode,
            post_process_fn=post_process_fn,
        )
    ):
        result[idx + 1] = rastered

    return result
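
The returned mapping is keyed by 1-indexed page number, so iteration looks like:

```python
images = document.rasterize_pdf(dpi=100, return_mode="bytes")

for page_number, image_bytes in images.items():
    print(page_number, len(image_bytes))  # pages 1..num_pages
```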

split(start=None, stop=None)

Splits a document into multiple documents

Source code in docprompt/schema/document.py
def split(self, start: Optional[int] = None, stop: Optional[int] = None):
    """
    Splits a document into multiple documents
    """
    if start is None and stop is None:
        raise ValueError("Must specify either start or stop")

    start = start or 0

    from docprompt.utils.splitter import split_pdf_to_bytes

    split_bytes = split_pdf_to_bytes(
        self.file_bytes, start_page=start, stop_page=stop
    )

    return Document.from_bytes(split_bytes, name=self.name)
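
Each call returns a single new document for the requested range (the exact inclusivity of `start`/`stop` is defined by `split_pdf_to_bytes`); a sketch:

```python
head = document.split(stop=5)    # pages from the beginning up to the stop boundary
tail = document.split(start=5)   # pages from the start boundary onward
```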

to_compressed_bytes(compression_kwargs={})

Compresses the document using Ghostscript

Source code in docprompt/schema/document.py
def to_compressed_bytes(self, compression_kwargs: dict = {}) -> bytes:
    """
    Compresses the document using Ghostscript
    """
    with self.as_tempfile() as temp_path:
        return compress_pdf_to_bytes(temp_path, **compression_kwargs)
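
A sketch; this requires a working Ghostscript installation, since compression is delegated to it:

```python
smaller_bytes = document.to_compressed_bytes()
```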

write_to_path(path, **kwargs)

Writes the document to a path

Source code in docprompt/schema/document.py
def write_to_path(self, path: Union[PathLike, str], **kwargs):
    """
    Writes the document to a path
    """
    path = Path(path)

    if path.is_dir():
        path = path / self.name

    with path.open("wb") as f:
        f.write(self.file_bytes)
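
Because directory paths are resolved against `self.name`, both forms below work (paths are illustrative):

```python
document.write_to_path("/tmp/out.pdf")  # written exactly to this file
document.write_to_path("/tmp")          # written to /tmp/<document.name>
```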

get_page_render_size_from_bytes(file_bytes, page_number, dpi=DEFAULT_DPI)

Returns the render size of a page in pixels

Source code in docprompt/schema/document.py
def get_page_render_size_from_bytes(
    file_bytes: bytes, page_number: int, dpi: int = DEFAULT_DPI
):
    """
    Returns the render size of a page in pixels
    """

    with get_pdfium_document(file_bytes) as pdf:
        page = pdf.get_page(page_number)

        mediabox = page.get_mediabox()

        base_width = int(mediabox[2] - mediabox[0])
        base_height = int(mediabox[3] - mediabox[1])

        width = int(base_width * dpi / 72)
        height = int(base_height * dpi / 72)

        return width, height
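
For example, a US Letter page has a 612 × 792 point media box, so at `dpi=144` this returns `612 * 144 / 72 = 1224` by `792 * 144 / 72 = 1584` pixels.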

layout

BoundingPoly

Bases: BaseModel

Represents a normalized bounding poly with each value in the range [0, 1]

Used for higher order shapes like polygons on a page

Source code in docprompt/schema/layout.py
class BoundingPoly(BaseModel):
    """
    Represents a normalized bounding poly with each value in the range [0, 1]

    Used for higher order shapes like polygons on a page
    """

    normalized_vertices: List[Point]

    def __getitem__(self, index):
        return self.normalized_vertices[index]

NormBBox

Bases: BaseModel

Represents a normalized bounding box with each value in the range [0, 1]

Where x1 > x0 and bottom > top

Source code in docprompt/schema/layout.py
class NormBBox(BaseModel):
    """
    Represents a normalized bounding box with each value in the range [0, 1]

    Where x1 > x0 and bottom > top
    """

    x0: BoundedFloat
    top: BoundedFloat
    x1: BoundedFloat
    bottom: BoundedFloat

    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    def as_tuple(self):
        return (self.x0, self.top, self.x1, self.bottom)

    def __getitem__(self, index):
        # Lots of if statements to prevent new allocations
        if index > 3:
            raise IndexError("Index out of range")

        if index == 0:
            return self.x0
        elif index == 1:
            return self.top
        elif index == 2:
            return self.x1
        elif index == 3:
            return self.bottom

    def __eq__(self, other):
        if not isinstance(other, NormBBox):
            return False

        return self.as_tuple() == other.as_tuple()

    def __hash__(self):
        return hash(self.as_tuple())

    def __and__(self, other):
        if not isinstance(other, NormBBox):
            raise TypeError("Can only compute intersection with NormBBox")
        # Compute the intersection of two bounding boxes
        new_x0 = max(self.x0, other.x0)
        new_top = max(self.top, other.top)
        new_x1 = min(self.x1, other.x1)
        new_bottom = min(self.bottom, other.bottom)

        # Check if there is an actual intersection and if the resulting bounding box is valid
        if new_x0 <= new_x1 and new_top <= new_bottom:
            return NormBBox(x0=new_x0, top=new_top, x1=new_x1, bottom=new_bottom)
        else:
            # Return an empty or non-existent bounding box representation
            return None

    def __add__(self, other):
        if not isinstance(other, NormBBox):
            raise TypeError("Can only add NormBBox to NormBBox")

        return NormBBox(
            x0=min(self.x0, other.x0),
            top=min(self.top, other.top),
            x1=max(self.x1, other.x1),
            bottom=max(self.bottom, other.bottom),
        )

    def __contains__(self, other):
        return (
            self.x0 <= other.x0
            and self.top <= other.top
            and self.x1 >= other.x1
            and self.bottom >= other.bottom
        )

    def intersection_over_union(self, other):
        if not isinstance(other, NormBBox):
            raise TypeError("Can only compute IOU with NormBBox")

        # Compute the intersection
        intersection_bbox = self & other

        if intersection_bbox:
            intersection_area = intersection_bbox.area
            union_area = self.area + other.area - intersection_area
            return intersection_area / union_area

        return 0  # No intersection

    def x_overlap(self, other):
        """
        Get the overlap, between 0 and 1, of the x-axis of two bounding boxes
        """
        return max(0, min(self.x1, other.x1) - max(self.x0, other.x0))

    def y_overlap(self, other):
        """
        Get the overlap, between 0 and 1, of the y-axis of two bounding boxes
        """
        return max(0, min(self.bottom, other.bottom) - max(self.top, other.top))

    @classmethod
    def combine(cls, *bboxes: "NormBBox"):
        """
        Combines multiple bounding boxes into a single bounding box
        """
        if len(bboxes) == 0:
            raise ValueError("Must provide at least one bounding box")

        if len(bboxes) == 1:
            return bboxes[0]

        working_bbox = bboxes[0]
        for bbox in bboxes[1:]:
            working_bbox = working_bbox + bbox

        return working_bbox

    @classmethod
    def from_bounding_poly(cls, bounding_poly: "BoundingPoly"):
        """
        Returns a NormBBox from a BoundingPoly
        """
        if len(bounding_poly.normalized_vertices) != 4:
            raise ValueError(
                "BoundingPoly must have 4 vertices for NormBBox conversion"
            )

        (
            top_left,
            top_right,
            bottom_right,
            bottom_left,
        ) = bounding_poly.normalized_vertices

        return cls(
            x0=top_left.x,
            top=top_left.y,
            x1=bottom_right.x,
            bottom=bottom_right.y,
        )

    @property
    def width(self):
        return self.x1 - self.x0

    @property
    def height(self):
        return self.bottom - self.top

    @property
    def area(self):
        return self.width * self.height

    @property
    def centroid(self):
        return (self.x0 + self.x1) / 2, (self.top + self.bottom) / 2

    @property
    def y_center(self):
        return (self.top + self.bottom) / 2

    @property
    def x_center(self):
        return (self.x0 + self.x1) / 2
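
A sketch of the box algebra defined above (values chosen for easy arithmetic):

```python
from docprompt.schema.layout import NormBBox

a = NormBBox(x0=0.1, top=0.1, x1=0.5, bottom=0.5)
b = NormBBox(x0=0.3, top=0.3, x1=0.7, bottom=0.7)

a & b                         # intersection: NormBBox(x0=0.3, top=0.3, x1=0.5, bottom=0.5)
a + b                         # enclosing box: NormBBox(x0=0.1, top=0.1, x1=0.7, bottom=0.7)
a.x_overlap(b)                # 0.2, in normalized page units
a.intersection_over_union(b)  # 0.04 / (0.16 + 0.16 - 0.04) = 1/7 ≈ 0.1429
```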

combine(*bboxes) classmethod

Combines multiple bounding boxes into a single bounding box

Source code in docprompt/schema/layout.py
@classmethod
def combine(cls, *bboxes: "NormBBox"):
    """
    Combines multiple bounding boxes into a single bounding box
    """
    if len(bboxes) == 0:
        raise ValueError("Must provide at least one bounding box")

    if len(bboxes) == 1:
        return bboxes[0]

    working_bbox = bboxes[0]
    for bbox in bboxes[1:]:
        working_bbox = working_bbox + bbox

    return working_bbox
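
Combining is repeated `__add__`, so the result encloses every input; continuing the sketch above:

```python
c = NormBBox(x0=0.0, top=0.6, x1=0.2, bottom=0.9)

merged = NormBBox.combine(a, b, c)
# NormBBox(x0=0.0, top=0.1, x1=0.7, bottom=0.9), the box enclosing all three
```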

from_bounding_poly(bounding_poly) classmethod

Returns a NormBBox from a BoundingPoly

Source code in docprompt/schema/layout.py
@classmethod
def from_bounding_poly(cls, bounding_poly: "BoundingPoly"):
    """
    Returns a NormBBox from a BoundingPoly
    """
    if len(bounding_poly.normalized_vertices) != 4:
        raise ValueError(
            "BoundingPoly must have 4 vertices for NormBBox conversion"
        )

    (
        top_left,
        top_right,
        bottom_right,
        bottom_left,
    ) = bounding_poly.normalized_vertices

    return cls(
        x0=top_left.x,
        top=top_left.y,
        x1=bottom_right.x,
        bottom=bottom_right.y,
    )

x_overlap(other)

Get the overlap, between 0 and 1, of the x-axis of two bounding boxes

Source code in docprompt/schema/layout.py
def x_overlap(self, other):
    """
    Get the overlap, between 0 and 1, of the x-axis of two bounding boxes
    """
    return max(0, min(self.x1, other.x1) - max(self.x0, other.x0))

y_overlap(other)

Get the overlap, between 0 and 1, of the y-axis of two bounding boxes

Source code in docprompt/schema/layout.py
def y_overlap(self, other):
    """
    Get the overlap, between 0 and 1, of the y-axis of two bounding boxes
    """
    return max(0, min(self.bottom, other.bottom) - max(self.top, other.top))

Point

Bases: BaseModel

Represents a normalized point with each value in the range [0, 1]

Source code in docprompt/schema/layout.py
class Point(BaseModel):
    """
    Represents a normalized point with each value in the range [0, 1]
    """

    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    x: BoundedFloat
    y: BoundedFloat

TextBlock

Bases: BaseModel

Represents a single block of text, with its bounding box. The bounding box is a tuple of (x0, top, x1, bottom) and is normalized to the page size.

Source code in docprompt/schema/layout.py
class TextBlock(BaseModel):
    """
    Represents a single block of text, with its bounding box.
    The bounding box is a tuple of (x0, top, x1, bottom) and
    is normalized to the page size.
    """

    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    text: str
    type: SegmentLevels
    source: TextblockSource = Field(
        default="derived", description="The source of the text block"
    )

    # Layout information
    bounding_box: NormBBox = Field(default=None, repr=False)
    bounding_poly: Optional[BoundingPoly] = Field(default=None, repr=False)
    text_spans: Optional[List[TextSpan]] = Field(default=None, repr=False)

    metadata: Optional[TextBlockMetadata] = Field(default_factory=TextBlockMetadata)

    def __getitem__(self, index):
        return getattr(self, index)

    def __hash__(self):
        return hash((self.text, self.bounding_box.as_tuple()))

    @property
    def confidence(self):
        return self.metadata.confidence

    @property
    def direction(self):
        return self.metadata.direction
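
A construction sketch (the `type` value `"line"` is assumed here to be a valid `SegmentLevels` literal):

```python
from docprompt.schema.layout import NormBBox, TextBlock

block = TextBlock(
    text="Hello world",
    type="line",  # assumed SegmentLevels value
    bounding_box=NormBBox(x0=0.1, top=0.1, x1=0.4, bottom=0.15),
)

block["text"]     # "Hello world"; __getitem__ proxies to getattr
block.confidence  # proxied from block.metadata
```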

pipeline

BaseMetadata

Bases: BaseModel, MutableMapping, Generic[TMetadataOwner]

The base metadata class is utilized for defining a basic yet flexible interface for metadata attached to various fields.

When used out of the box, the metadata class will adopt dictionary-like behavior. You may easily access different fields of the metadata as if it were a dictionary:

# Instantiate it with any kwargs you like
metadata = BaseMetadata(foo='bar', cow='moo')

metadata["foo"]  # "bar"
metadata["cow"]  # "moo"

# Update the value of the key
metadata["foo"] = "fighters"

# Set new key-value pairs
metadata['sheep'] = 'baa'

Otherwise, you may sub-class the metadata class in order to create a more strictly typed metadata model. This is useful when you want to enforce a specific structure for your metadata.

class CustomMetadata(BaseMetadata):
    foo: str
    cow: str

# Instantiate it with the required fields
metadata = CustomMetadata(foo='bar', cow='moo')

metadata.foo  # "bar"
metadata.cow  # "moo"

# Update the value of the key
metadata.foo = "fighters"

# Use the extra field to store dynamic metadata
metadata.extra['sheep'] = 'baa'

Additionally, the task results descriptor allows for controlled and easy access to the task results of various tasks that are run on the parent node.

Source code in docprompt/schema/pipeline/metadata.py
class BaseMetadata(BaseModel, MutableMapping, Generic[TMetadataOwner]):
    """
    The base metadata class is utilized for defining a basic yet flexible interface
    for metadata attached to various fields.

    The metadata class can be used in two ways:
        1. As a dictionary-like object, where metadata is stored in the `extra` field.
        2. As a sub-classed model, where metadata is stored in the fields of the model.

    When used out of the box, the metadata class will adopt dictionary-like behavior. You
    may easily access different fields of the metadata as if it were a dictionary:
    ```python
    # Instantiate it with any kwargs you like
    metadata = BaseMetadata(foo='bar', cow='moo')

    metadata["foo"]  # "bar"
    metadata["cow"]  # "moo"

    # Update the value of the key
    metadata["foo"] = "fighters"

    # Set new key-value pairs
    metadata['sheep'] = 'baa'
    ```

    Otherwise, you may sub-class the metadata class in order to create a more strictly typed
    metadata model. This is useful when you want to enforce a specific structure for your metadata.

    ```python
    class CustomMetadata(BaseMetadata):
        foo: str
        cow: str

    # Instantiate it with the required fields
    metadata = CustomMetadata(foo='bar', cow='moo')

    metadata.foo  # "bar"
    metadata.cow  # "moo"

    # Update the value of the key
    metadata.foo = "fighters"

    # Use the extra field to store dynamic metadata
    metadata.extra['sheep'] = 'baa'
    ```

    Additionally, the task results descriptor allows for controlled and easy access to the task results
    of various tasks that are run on the parent node.
    """

    extra: Dict[str, Any] = Field(..., default_factory=dict, repr=False)

    _task_results: TaskResultsDescriptor = PrivateAttr(
        default_factory=TaskResultsDescriptor
    )

    _owner: TMetadataOwner = PrivateAttr()

    @property
    def task_results(self) -> TaskResultsDescriptor:
        """Return the task results descriptor."""
        return self._task_results.__get__(self)

    @task_results.setter
    def task_results(self, value: Any) -> None:
        """This will raise an error, as we do not want to set the task results directly.

        NOTE: This implementation is here purely to avoid the task_results property from being
        overwritten by accident.
        """
        self._task_results.__set__(self, value)

    @property
    def owner(self) -> TMetadataOwner:
        """Return the owner of the metadata."""
        return self._owner

    @owner.setter
    def owner(self, owner: TMetadataOwner) -> None:
        """Return the owner of the metadata."""
        self._owner = owner

    @classmethod
    def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
        """Create a new instance of the metadata class with the owner set."""
        metadata = cls(**data)
        metadata.owner = owner
        return metadata

    @model_validator(mode="before")
    @classmethod
    def validate_data_fields_from_annotations(cls, data: Any) -> Any:
        """Validate the data fields from the annotations."""

        # We want to make sure that we combine the `extra` metadata along with any
        # other specific fields that are defined in the metadata.
        extra = data.pop("extra", {})
        assert isinstance(extra, dict), "The `extra` field must be a dictionary."
        data = {**data, **extra}

        # If the model has been sub-classed, then all of our fields must be
        # validated by the pydantic model.
        if cls._is_field_typed():
            # We will get the fields out of extra and set them as potential fields to
            # validate. They will be ignored if they are not defined in the model, but it
            # allows for a more flexible way to define metadata.
            # Otherwise, whatever is in the `extra` field will be stored in the `extra` field.
            return {**data, "extra": extra}

        # Otherwise, we are using our mock-dict implementation, so we store our
        # metadata in the `extra` field.
        return {"extra": data}

    @classmethod
    def _is_field_typed(cls):
        """
        Check if the metadata model is field typed.

        This is used to determine if the metadata model is a dictionary-like model,
        or a more strictly typed model.
        """
        if set(["extra"]) != set(cls.model_fields.keys()):
            return True

        return False

    def __repr__(self):
        """
        Provide a string representation of the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __repr__ method.
        """
        if self._is_field_typed():
            return super().__repr__()

        # Otherwise, we are dealing with dictionary-like metadata
        return json.dumps(self.extra)

    def __getitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __getitem__ method.
        """
        if not self._is_field_typed():
            return self.extra[name]

        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __setitem__(self, name, value):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __setitem__ method.
        """
        if not self._is_field_typed():
            self.extra[name] = value
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __delitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __delitem__ method.
        """
        if not self._is_field_typed():
            del self.extra[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __iter__(self):
        """
        Iterate over the keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have an __iter__ method.
        """
        if self._is_field_typed():
            raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

        return iter(self.extra)

    def __len__(self):
        """
        Get the number of keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModel's do not have a __len__ method.
        """
        if self._is_field_typed():
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '__len__'"
            )

        return len(self.extra)

    def __getattr__(self, name):
        """
        Allow for getting of attributes on the metadata class.

        The attributes are retrieved through the following hierarchy:
            - If the model is sub-classed, it will be retrieved as normal.
            - Otherwise, if the attribute is private, it will be retrieved as normal.
            - Finally, if we are getting a public attribute on the base metadata class,
                we use the extra field.
            - If the key is not set in the `extra` dictionary, we resort back to just
            trying to get the field.
                - This is when we grab the `owner` or `task_results` attribute.
        """
        if self._is_field_typed():
            return super().__getattr__(name)

        if name.startswith("_"):
            return super().__getattr__(name)

        # Attempt to retrieve the attr from the `extra` field
        try:
            return self.extra.get(name)

        except KeyError:
            # This is for grabbing properties on the base metadata class
            return super().__getattr__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Allow for setting of attributes on the metadata class.

        The attributes are set through the following hierarchy:
            - If the model is sub-classed, it will be set as normal.
            - Otherwise, if the attribute is private, it will be set as normal.
            - Finally, if we are setting a public attribute on the base metadata class,
                we use the extra field.
        """
        if self._is_field_typed():
            return super().__setattr__(name, value)

        # We want to avoid setting any private attributes in the extra
        # dictionary
        if name.startswith("_"):
            return super().__setattr__(name, value)

        # If it is `owner` or `task_results`, we want
        # to avoid setting the attribute in the `extra` dictionary
        if name in ["owner", "task_results"]:
            return super().__setattr__(name, value)

        self.extra[name] = value

    def __delattr__(self, name: str) -> None:
        """
        Ensure that we can delete attributes from the metadata class.

        The attributes are deleted through the following hierarchy:
            - If the attribute is `task_results`, we use the descriptor to delete the task results.
            - Otherwise, if it is a sub-classed model, it will be deleted as normal.
            - Finally, if we are deleting a public attribute on the base metadata class,
                we use the extra field.
        """

        # We want to use the descriptor to delete the task results
        if name == "task_results":
            self._task_results.__delete__(self)
            return

        # Otherwise, we use our standard fallback tiers
        if self._is_field_typed():
            return super().__delattr__(name)

        del self.extra[name]
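
Beyond item access, the dunder methods above make the untyped form behave like a full `MutableMapping`; a sketch:

```python
from docprompt.schema.pipeline.metadata import BaseMetadata

metadata = BaseMetadata(foo="bar", cow="moo")

for key in metadata:   # iterates the keys stored in `extra`
    print(key, metadata[key])

print(len(metadata))   # 2
del metadata["cow"]    # removes the key from `extra`
```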

owner: TMetadataOwner property writable

Return the owner of the metadata.

task_results: TaskResultsDescriptor property writable

Return the task results descriptor.

__delattr__(name)

Ensure that we can delete attributes from the metadata class.

The attributes are deleted through the following hierarchy:
  • If the attribute is task_results, we use the descriptor to delete the task results.
  • Otherwise, if it is a sub-classed model, it will be deleted as normal.
  • Finally, if we are deleting a public attribute on the base metadata class, we use the extra field.

Source code in docprompt/schema/pipeline/metadata.py
def __delattr__(self, name: str) -> None:
    """
    Ensure that we can delete attributes from the metadata class.

    The attributes are deleted through the following hierarchy:
        - If the attribute is `task_results`, we use the descriptor to delete the task results.
        - Otherwise, if it is a sub-classed model, it will be deleted as normal.
        - Finally, if we are deleting a public attribute on the base metadata class,
            we use the extra field.
    """

    # We want to use the descriptor to delete the task results
    if name == "task_results":
        self._task_results.__delete__(self)
        return

    # Otherwise, we use our standard fallback tiers
    if self._is_field_typed():
        return super().__delattr__(name)

    del self.extra[name]

__delitem__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an `__delitem__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __delitem__(self, name):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __delitem__ method.
    """
    if not self._is_field_typed():
        del self.extra[name]
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

__getattr__(name)

Allow for getting of attributes on the metadata class.

The attributes are retrieved through the following hierarchy:
  • If the model is sub-classed, it will be retrieved as normal.
  • Otherwise, if the attribute is private, it will be retrieved as normal.
  • Finally, if we are getting a public attribute on the base metadata class, we use the extra field.
  • If the key is not set in the extra dictionary, we resort back to just trying to get the field.
    • This is when we grab the owner or task_results attribute.

Source code in docprompt/schema/pipeline/metadata.py
def __getattr__(self, name):
    """
    Allow for getting of attributes on the metadata class.

    The attributes are retrieved through the following hierarchy:
        - If the model is sub-classed, it will be retrieved as normal.
        - Otherwise, if the attribute is private, it will be retrieved as normal.
        - Finally, if we are getting a public attribute on the base metadata class,
            we use the extra field.
        - If the key is not set in the `extra` dictionary, we resort back to just
        trying to get the field.
            - This is when we grab the `owner` or `task_results` attribute.
    """
    if self._is_field_typed():
        return super().__getattr__(name)

    if name.startswith("_"):
        return super().__getattr__(name)

    # Attempt to retrieve the attr from the `extra` field
    try:
        return self.extra.get(name)

    except KeyError:
        # This is for grabbing properties on the base metadata class
        return super().__getattr__(name)

__getitem__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an `__getitem__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __getitem__(self, name):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __getitem__ method.
    """
    if not self._is_field_typed():
        return self.extra[name]

    raise AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '{name}'"
    )

__iter__()

Iterate over the keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an `__iter__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __iter__(self):
    """
    Iterate over the keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __iter__ method.
    """
    if self._is_field_typed():
        raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

    return iter(self.extra)

__len__()

Get the number of keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a `__len__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __len__(self):
    """
    Get the number of keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have a __len__ method.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '__len__'"
        )

    return len(self.extra)

__repr__()

Provide a string representation of the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a `__repr__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __repr__(self):
    """
    Provide a string representation of the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have a __repr__ method.
    """
    if self._is_field_typed():
        return super().__repr__()

    # Otherwise, we are dealing with dictionary-like metadata
    return json.dumps(self.extra)

__setattr__(name, value)

Allow for setting of attributes on the metadata class.

The attributes are set through the following hierarchy:
  • If the model is sub-classed, it will be set as normal.
  • Otherwise, if the attribute is private, it will be set as normal.
  • Finally, if we are setting a public attribute on the base metadata class, we use the extra field.

Source code in docprompt/schema/pipeline/metadata.py
def __setattr__(self, name: str, value: Any) -> None:
    """
    Allow for setting of attributes on the metadata class.

    The attributes are set through the following hierarchy:
        - If the model is sub-classed, it will be set as normal.
        - Otherwise, if the attribute is private, it will be set as normal.
        - Finally, if we are setting a public attribute on the base metadata class,
            we use the extra field.
    """
    if self._is_field_typed():
        return super().__setattr__(name, value)

    # We want to avoid setting any private attributes in the extra
    # dictionary
    if name.startswith("_"):
        return super().__setattr__(name, value)

    # If it is `owner` or `task_results`, we want
    # to avoid setting the attribute in the `extra` dictionary
    if name in ["owner", "task_results"]:
        return super().__setattr__(name, value)

    self.extra[name] = value

__setitem__(name, value)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an `__setitem__` method.

Source code in docprompt/schema/pipeline/metadata.py
def __setitem__(self, name, value):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModel's do not have an __setitem__ method.
    """
    if not self._is_field_typed():
        self.extra[name] = value
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

from_owner(owner, **data) classmethod

Create a new instance of the metadata class with the owner set.

Source code in docprompt/schema/pipeline/metadata.py
@classmethod
def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
    """Create a new instance of the metadata class with the owner set."""
    metadata = cls(**data)
    metadata.owner = owner
    return metadata

validate_data_fields_from_annotations(data) classmethod

Validate the data fields from the annotations.

Source code in docprompt/schema/pipeline/metadata.py
@model_validator(mode="before")
@classmethod
def validate_data_fields_from_annotations(cls, data: Any) -> Any:
    """Validate the data fields from the annotations."""

    # We want to make sure that we combine the `extra` metadata along with any
    # other specific fields that are defined in the metadata.
    extra = data.pop("extra", {})
    assert isinstance(extra, dict), "The `extra` field must be a dictionary."
    data = {**data, **extra}

    # If the model has been sub-classed, then all of our fields must be
    # validated by the pydantic model.
    if cls._is_field_typed():
        # We will get the fields out of extra and set them as potential fields to
        # validate. They will be ignored if they are not defined in the model, but it
        # allows for a more flexible way to define metadata.
        # Otherwise, whatever is in the `extra` field will be stored in the `extra` field.
        return {**data, "extra": extra}

    # Otherwise, we are using our mock-dict implementation, so we store our
    # metadata in the `extra` field.
    return {"extra": data}

DocumentCollection

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py
class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata
    """

    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)

DocumentNode

Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/document.py
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    """
    Represents a single document, with some metadata
    """

    document: PdfDocument
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    )
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",
        default_factory=BaseMetadata,
    )

    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        state = super().__getstate__()

        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        return len(self.page_nodes)

    def __getitem__(self, index):
        return self.page_nodes[index]

    def __iter__(self):
        return iter(self.page_nodes)

    @property
    def rasterizer(self):
        return DocumentRasterizer(self)

    @property
    def locator(self):
        if self._locator is None:
            self.refresh_locator()

        return self._locator

    def refresh_locator(self):
        """
        Refreshes the locator for this document node
        """
        from docprompt.provenance.search import DocumentProvenanceLocator

        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    @classmethod
    def from_document(
        cls,
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
    ):
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
            document=document,
        )
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}
        )

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."
            )

        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    document=document_node,
                    page_number=page_number,
                    metadata=page_metadata[page_number - 1],
                )
            else:
                page_node = PageNode(document=document_node, page_number=page_number)

            document_node.page_nodes.append(page_node)

        return document_node

    @property
    def file_hash(self):
        return self.document.document_hash

    @property
    def document_name(self):
        return self.document.name

    @classmethod
    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, we want to retrieve
        # it as a TypedDict
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
            )

        # Get the overridden Generic type of the DocumentNodeMetadata
        return metadata_field_annotation

    @classmethod
    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
            "metadata"
        ].annotation

        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."
            )

        return page_node_metadata_field_annotation

    @property
    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    @persistance_path.setter
    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    @classmethod
    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

        Args:
            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            DocumentNode: The loaded document node.
        """

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
            file_hash, **kwargs
        )

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
        else:
            metadata = cls.metadata_class().from_owner(node, **{})

        if page_metadata_bytes:
            page_metadata_json = [
                json.loads(page_str)
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            ]
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            ]
        else:
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            meta.set_owner(page)
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

        Args:
            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            FileSidecarsPathManager: The file path manager for the persisted document node.
        """

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
            encoding="utf-8",
        )

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
        )

persistance_path property writable

The base path to storage location.

from_storage(path, file_hash, **kwargs) classmethod

Load the document node from storage.

Parameters:

    path (str, required): The base path to storage location.
        - Example (S3): "s3://bucket-name/key/to/folder"
        - Example (Local FS): "/tmp/docprompt/storage"
    file_hash (str, required): The hash of the document.
    **kwargs: Additional keyword arguments for fsspec FileSystem (default: {})

Returns:

    DocumentNode (Self): The loaded document node.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Load the document node from storage.

    Args:
        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        DocumentNode: The loaded document node.
    """

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
        file_hash, **kwargs
    )

    doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
    node = cls.from_document(doc)

    if metadata_bytes:
        metadata_json = json.loads(metadata_bytes.decode("utf-8"))
        metadata = cls.metadata_class().from_owner(node, **metadata_json)
    else:
        metadata = cls.metadata_class().from_owner(node, **{})

    if page_metadata_bytes:
        page_metadata_json = [
            json.loads(page_str)
            for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
        ]
        page_metadata = [
            cls.page_metadata_class()(**page) for page in page_metadata_json
        ]
    else:
        page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

    # Store the metadata on the node and page nodes
    node.metadata = metadata
    for page, meta in zip(node.page_nodes, page_metadata):
        meta.set_owner(page)
        page.metadata = meta

    # Make sure to set the persistance path on the node
    node.persistance_path = path

    return node
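
A usage sketch (not from the library's documentation): the top-level import and the hash value below are illustrative assumptions.

from docprompt import DocumentNode  # assumed top-level export

node = DocumentNode.from_storage(
    "/tmp/docprompt/storage",
    file_hash="<hash from an earlier node.file_hash>",  # placeholder value
)

print(node.document_name, len(node))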

metadata_class() classmethod

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def metadata_class(cls) -> Type[BaseMetadata]:
    """Get the metadata class for instantiating metadata from the model."""

    fields = cls.model_fields

    # NOTE: The indexing is important here, and relies on the generic type being
    # the SECOND of the two arguments in the `Union` annotation
    metadata_field_annotation = fields["metadata"].annotation

    # If no override has been provided to the metadata model, fall back
    # to the default BaseMetadata class
    if metadata_field_annotation == DocumentNodeMetadata:
        return BaseMetadata

    if isinstance(metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
        )

    # Get the overridden Generic type of the DocumentNodeMetadata
    return metadata_field_annotation

page_metadata_class() classmethod

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/document.py
@classmethod
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Get the metadata class for the page nodes in the document."""
    fields = cls.model_fields

    # NOTE: The indexing is important here, and it allows us to get the type of each
    # page node in the `List` annotation
    page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

    # NOTE: The indexing is important here, and relies on the generic type being
    # the SECOND of the two arguments in the `Union` annotation
    page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
        "metadata"
    ].annotation

    if page_node_metadata_field_annotation == PageNodeMetadata:
        return BaseMetadata

    if isinstance(page_node_metadata_field_annotation, ForwardRef):
        raise ValueError(
            "You cannot define PageNode with a ForwardRef for Generic metadata model types."
        )

    return page_node_metadata_field_annotation
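
To see how the two resolvers behave, here is a sketch (assuming pydantic v2 generic parametrization; the import paths follow the source locations shown in this reference):

from docprompt.schema.pipeline.metadata import BaseMetadata
from docprompt.schema.pipeline.node.document import DocumentNode

class DocMeta(BaseMetadata):
    title: str = ""

class PageMeta(BaseMetadata):
    label: str = ""

TypedNode = DocumentNode[DocMeta, PageMeta]

# Overridden generics resolve to the concrete models...
assert TypedNode.metadata_class() is DocMeta
assert TypedNode.page_metadata_class() is PageMeta

# ...while the unparametrized class falls back to BaseMetadata.
assert DocumentNode.metadata_class() is BaseMetadata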

persist(path=None, **kwargs)

Persist a document node to storage.

Parameters:

    path (Optional[str], default None): Overwrites the current persistance_path property.
        - If persistance_path is not currently set, path must be provided.
    **kwargs: Additional keyword arguments for fsspec FileSystem (default: {})

Returns:

    FileSidecarsPathManager: The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/document.py
def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Persist a document node to storage.

    Args:
        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        FileSidecarsPathManager: The file path manager for the persisted document node.
    """

    path = path or self.persistance_path

    if path is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Make sure to update the persistance path
    self.persistance_path = path

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes = self.document.get_bytes()
    metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
    page_metadata_bytes = bytes(
        json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
        encoding="utf-8",
    )

    return fs_manager.write(
        pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
    )
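
A round-trip sketch, assuming a local PDF at a hypothetical path and the same assumed top-level imports as above:

from docprompt import DocumentNode, PdfDocument  # assumed exports

document = PdfDocument.from_path("example.pdf")  # hypothetical file
node = DocumentNode.from_document(document)

# The first call must supply a path; it is remembered on
# persistance_path, so later calls may omit it.
node.persist("/tmp/docprompt/storage")
node.persist()  # re-uses the stored path

# The node can later be reloaded by its content hash.
restored = DocumentNode.from_storage("/tmp/docprompt/storage", node.file_hash)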

refresh_locator()

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/document.py
def refresh_locator(self):
    """
    Refreshes the locator for this document node
    """
    from docprompt.provenance.search import DocumentProvenanceLocator

    if any(not page.ocr_results.result for page in self.page_nodes):
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"
        )

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator

PageNode

Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/page.py
class PageNode(BaseNode, Generic[PageNodeMetadata]):
    """
    Represents a single page in a document, with some metadata
    """

    document: "DocumentNode" = Field(exclude=True, repr=False)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        description="Application-specific metadata for the page",
        default_factory=BaseMetadata,
    )
    extra: Dict[str, Any] = Field(
        description="Extra data that can be stored on the page node",
        default_factory=dict,
    )

    ocr_results: ResultContainer[OcrPageResult] = Field(
        default_factory=_result_container_factory,
        description="The OCR results for the page",
        repr=False,
    )

    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        state = super().__getstate__()

        state["__pydantic_private__"]["_raster_cache"] = {}

        return state

    @property
    def rasterizer(self):
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True
    ):
        return self.document.locator.search(
            query,
            page_number=self.page_number,
            refine_to_word=refine_to_words,
            require_exact_match=require_exact_match,
        )
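
Because search delegates to the document-level locator, every page must already carry OCR results; otherwise building the locator raises a ValueError (see refresh_locator above). A sketch, assuming node is a DocumentNode whose pages have been OCRed:

page = node.page_nodes[0]

# Exact-match search scoped to this page, refined to word-level results.
matches = page.search("Total Amount Due")

for match in matches:
    print(match)  # provenance results from the locator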

metadata

The metadata class is utilized for defining a basic, yet flexible interface for metadata attached to various fields.

In essence, this allows developers to either create their metadata in an unstructured manner (i.e. as a dictionary), or to subclass the base metadata class in order to create a more strictly typed metadata model for their page and document nodes.

BaseMetadata

Bases: BaseModel, MutableMapping, Generic[TMetadataOwner]

The base metadata class is utilized for defining a basic yet flexible interface for metadata attached to various fields.

When used out of the box, the metadata class will adopt dictionary-like behavior. You may easily access different fields of the metadata as if it were a dictionary:

# Instantiate it with any kwargs you like
metadata = BaseMetadata(foo='bar', cow='moo')

metadata["foo"]  # "bar"
metadata["cow"]  # "moo"

# Update the value of the key
metadata["foo"] = "fighters"

# Set new key-value pairs
metadata['sheep'] = 'baa'

Otherwise, you may sub-class the metadata class in order to create a more strictly typed metadata model. This is useful when you want to enforce a specific structure for your metadata.

class CustomMetadata(BaseMetadata):
    foo: str
    cow: str

# Instantiate it with the required fields
metadata = CustomMetadata(foo='bar', cow='moo')

metadata.foo  # "bar"
metadata.cow  # "moo"

# Update the value of the key
metadata.foo = "fighters"

# Use the extra field to store dynamic metadata
metadata.extra['sheep'] = 'baa'

Additionally, the task results descriptor allows for controlled and easy access to the task results of various tasks that are run on the parent node.

Source code in docprompt/schema/pipeline/metadata.py
class BaseMetadata(BaseModel, MutableMapping, Generic[TMetadataOwner]):
    """
    The base metadata class is utilized for defining a basic yet flexible interface
    for metadata attached to various fields.

    The metadata class can be used in two ways:
        1. As a dictionary-like object, where metadata is stored in the `extra` field.
        2. As a sub-classed model, where metadata is stored in the fields of the model.

    When used out of the box, the metadata class will adopt dictionary-like behavior. You
    may easily access different fields of the metadata as if it were a dictionary:
    ```python
    # Instantiate it with any kwargs you like
    metadata = BaseMetadata(foo='bar', cow='moo')

    metadata["foo"]  # "bar"
    metadata["cow"]  # "moo"

    # Update the value of the key
    metadata["foo"] = "fighters"

    # Set new key-value pairs
    metadata['sheep'] = 'baa'
    ```

    Otherwise, you may sub-class the metadata class in order to create a more strictly typed
    metadata model. This is useful when you want to enforce a specific structure for your metadata.

    ```python
    class CustomMetadata(BaseMetadata):
        foo: str
        cow: str

    # Instantiate it with the required fields
    metadata = CustomMetadata(foo='bar', cow='moo')

    metadata.foo  # "bar"
    metadata.cow  # "moo"

    # Update the value of the key
    metadata.foo = "fighters"

    # Use the extra field to store dynamic metadata
    metadata.extra['sheep'] = 'baa'
    ```

    Additionally, the task results descriptor allows for controlled and easy access to the task results
    of various tasks that are run on the parent node.
    """

    extra: Dict[str, Any] = Field(default_factory=dict, repr=False)

    _task_results: TaskResultsDescriptor = PrivateAttr(
        default_factory=TaskResultsDescriptor
    )

    _owner: TMetadataOwner = PrivateAttr()

    @property
    def task_results(self) -> TaskResultsDescriptor:
        """Return the task results descriptor."""
        return self._task_results.__get__(self)

    @task_results.setter
    def task_results(self, value: Any) -> None:
        """This will raise an error, as we do not want to set the task results directly.

        NOTE: This implementation is here purely to avoid the task_results property from being
        overwritten by accident.
        """
        self._task_results.__set__(self, value)

    @property
    def owner(self) -> TMetadataOwner:
        """Return the owner of the metadata."""
        return self._owner

    @owner.setter
    def owner(self, owner: TMetadataOwner) -> None:
        """Return the owner of the metadata."""
        self._owner = owner

    @classmethod
    def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
        """Create a new instance of the metadata class with the owner set."""
        metadata = cls(**data)
        metadata.owner = owner
        return metadata

    @model_validator(mode="before")
    @classmethod
    def validate_data_fields_from_annotations(cls, data: Any) -> Any:
        """Validate the data fields from the annotations."""

        # We want to make sure that we combine the `extra` metadata along with any
        # other specific fields that are defined in the metadata.
        extra = data.pop("extra", {})
        assert isinstance(extra, dict), "The `extra` field must be a dictionary."
        data = {**data, **extra}

        # If the model has been sub-classed, then all of our fields must be
        # validated by the pydantic model.
        if cls._is_field_typed():
            # We will get the fields out of extra and set them as potential fields to
            # validate. They will be ignored if they are not defined in the model, but it
            # allows for a more flexible way to define metadata.
            # Otherwise, whatever is in the `extra` field will be stored in the `extra` field.
            return {**data, "extra": extra}

        # Otherwise, we are using our mock-dict implementation, so we store our
        # metadata in the `extra` field.
        return {"extra": data}

    @classmethod
    def _is_field_typed(cls):
        """
        Check if the metadata model is field typed.

        This is used to determine if the metadata model is a dictionary-like model,
        or a more strictly typed model.
        """
        if set(["extra"]) != set(cls.model_fields.keys()):
            return True

        return False

    def __repr__(self):
        """
        Provide a string representation of the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModels do not have a __repr__ method.
        """
        if self._is_field_typed():
            return super().__repr__()

        # Otherwise, we are dealing with dictionary-like metadata
        return json.dumps(self.extra)

    def __getitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModels do not have a __getitem__ method.
        """
        if not self._is_field_typed():
            return self.extra[name]

        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __setitem__(self, name, value):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModels do not have a __setitem__ method.
        """
        if not self._is_field_typed():
            self.extra[name] = value
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __delitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModels do not have a __delitem__ method.
        """
        if not self._is_field_typed():
            del self.extra[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __iter__(self):
        """
        Iterate over the keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModels do not have an __iter__ method.
        """
        if self._is_field_typed():
            raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

        return iter(self.extra)

    def __len__(self):
        """
        Get the number of keys in the metadata.

        This only works for the base metadata model. If sub-classed, this will raise an error,
        unless overridden, as BaseModels do not have a __len__ method.
        """
        if self._is_field_typed():
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '__len__'"
            )

        return len(self.extra)

    def __getattr__(self, name):
        """
        Allow for getting of attributes on the metadata class.

        The attributes are retrieved through the following hierarchy:
            - If the model is sub-classed, it will be retrieved as normal.
            - Otherwise, if the attribute is private, it will be retrieved as normal.
            - Finally, if we are getting a public attribute on the base metadata class,
                we use the extra field.
            - If the key is not set in the `extra` dictionary, we fall back to just
            trying to get the field.
                - This is when we grab the `owner` or `task_results` attribute.
        """
        if self._is_field_typed():
            return super().__getattr__(name)

        if name.startswith("_"):
            return super().__getattr__(name)

        # Attempt to retrieve the attr from the `extra` field
        try:
            return self.extra.get(name)

        except KeyError:
            # This is for grabbing properties on the base metadata class
            return super().__getattr__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Allow for setting of attributes on the metadata class.

        The attributes are set through the following hierarchy:
            - If the model is sub-classed, it will be set as normal.
            - Otherwise, if the attribute is private, it will be set as normal.
            - Finally, if we are setting a public attribute on the base metadata class,
                we use the extra field.
        """
        if self._is_field_typed():
            return super().__setattr__(name, value)

        # We want to avoid setting any private attributes in the extra
        # dictionary
        if name.startswith("_"):
            return super().__setattr__(name, value)

        # If it is `owner` or `task_results`, we want
        # to avoid setting the attribute in the `extra` dictionary
        if name in ["owner", "task_results"]:
            return super().__setattr__(name, value)

        self.extra[name] = value

    def __delattr__(self, name: str) -> None:
        """
        Ensure that we can delete attributes from the metadata class.

        The attributes are deleted through the following hierarchy:
            - If the attribute is `task_results`, we use the descriptor to delete the task results.
            - Otherwise, if it is a sub-classed model, it will be deleted as normal.
            - Finally, if we are deleting a public attribute on the base metadata class,
                we use the extra field.
        """

        # We want to use the descriptor to delete the task results
        if name == "task_results":
            self._task_results.__delete__(self)
            return

        # Otherwise, we use our standard fallback tiers
        if self._is_field_typed():
            return super().__delattr__(name)

        del self.extra[name]
owner: TMetadataOwner property writable

Return the owner of the metadata.

task_results: TaskResultsDescriptor property writable

Return the task results descriptor.

__delattr__(name)

Ensure that we can delete attributes from the metadata class.

The attributes are deleted through the following hierarchy:
  • If the attribute is task_results, we use the descriptor to delete the task results.
  • Otherwise, if it is a sub-classed model, it will be deleted as normal.
  • Finally, if we are deleting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/metadata.py
def __delattr__(self, name: str) -> None:
    """
    Ensure that we can delete attributes from the metadata class.

    The attributes are deleted through the following hierarchy:
        - If the attribute is `task_results`, we use the descriptor to delete the task results.
        - Otherwise, if it is a sub-classed model, it will be deleted as normal.
        - Finally, if we are deleting a public attribute on the base metadata class,
            we use the extra field.
    """

    # We want to use the descriptor to delete the task results
    if name == "task_results":
        self._task_results.__delete__(self)
        return

    # Otherwise, we use our standard fallback tiers
    if self._is_field_typed():
        return super().__delattr__(name)

    del self.extra[name]
__delitem__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a __delitem__ method.

Source code in docprompt/schema/pipeline/metadata.py
def __delitem__(self, name):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModels do not have a __delitem__ method.
    """
    if not self._is_field_typed():
        del self.extra[name]
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )
__getattr__(name)

Allow for getting of attributes on the metadata class.

The attributes are retrieved through the following hierarchy:
  • If the model is sub-classed, it will be retrieved as normal.
  • Otherwise, if the attribute is private, it will be retrieved as normal.
  • Finally, if we are getting a public attribute on the base metadata class, we use the extra field.
  • If the key is not set in the extra dictionary, we fall back to just trying to get the field.
    • This is when we grab the owner or task_results attribute.
Source code in docprompt/schema/pipeline/metadata.py
def __getattr__(self, name):
    """
    Allow for getting of attributes on the metadata class.

    The attributes are retrieved through the following hierarchy:
        - If the model is sub-classed, it will be retrieved as normal.
        - Otherwise, if the attribute is private, it will be retrieved as normal.
        - Finally, if we are getting a public attribute on the base metadata class,
            we use the extra field.
        - If the key is not set in the `extra` dictionary, we fall back to just
        trying to get the field.
            - This is when we grab the `owner` or `task_results` attribute.
    """
    if self._is_field_typed():
        return super().__getattr__(name)

    if name.startswith("_"):
        return super().__getattr__(name)

    # Attempt to retrieve the attr from the `extra` field
    try:
        return self.extra.get(name)

    except KeyError:
        # This is for grabbing properties on the base metadata class
        return super().__getattr__(name)
__getitem__(name)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a __getitem__ method.

Source code in docprompt/schema/pipeline/metadata.py
def __getitem__(self, name):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModels do not have a __getitem__ method.
    """
    if not self._is_field_typed():
        return self.extra[name]

    raise AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '{name}'"
    )
__iter__()

Iterate over the keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have an __iter__ method.

Source code in docprompt/schema/pipeline/metadata.py
def __iter__(self):
    """
    Iterate over the keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModels do not have an __iter__ method.
    """
    if self._is_field_typed():
        raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

    return iter(self.extra)
__len__()

Get the number of keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a __len__ method.

Source code in docprompt/schema/pipeline/metadata.py
def __len__(self):
    """
    Get the number of keys in the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModels do not have a __len__ method.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '__len__'"
        )

    return len(self.extra)
__repr__()

Provide a string representation of the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a __repr__ method.

Source code in docprompt/schema/pipeline/metadata.py
def __repr__(self):
    """
    Provide a string representation of the metadata.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModels do not have a __repr__ method.
    """
    if self._is_field_typed():
        return super().__repr__()

    # Otherwise, we are dealing with dictionary-like metadata
    return json.dumps(self.extra)
__setattr__(name, value)

Allow for setting of attributes on the metadata class.

The attributes are set through the following hierarchy:
  • If the model is sub-classed, it will be set as normal.
  • Otherwise, if the attribute is private, it will be set as normal.
  • Finally, if we are setting a public attribute on the base metadata class, we use the extra field.
Source code in docprompt/schema/pipeline/metadata.py
def __setattr__(self, name: str, value: Any) -> None:
    """
    Allow for setting of attributes on the metadata class.

    The attributes are set through the following hierarchy:
        - If the model is sub-classed, it will be set as normal.
        - Otherwise, if the attribute is private, it will be set as normal.
        - Finally, if we are setting a public attribute on the base metadata class,
            we use the extra field.
    """
    if self._is_field_typed():
        return super().__setattr__(name, value)

    # We want to avoid setting any private attributes in the extra
    # dictionary
    if name.startswith("_"):
        return super().__setattr__(name, value)

    # If it is `owner` or `task_results`, we want
    # to avoid setting the attribute in the `extra` dictionary
    if name in ["owner", "task_results"]:
        return super().__setattr__(name, value)

    self.extra[name] = value
__setitem__(name, value)

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a __setitem__ method.

Source code in docprompt/schema/pipeline/metadata.py
def __setitem__(self, name, value):
    """
    Provide dictionary functionality to the metadata class.

    This only works for the base metadata model. If sub-classed, this will raise an error,
    unless overridden, as BaseModels do not have a __setitem__ method.
    """
    if not self._is_field_typed():
        self.extra[name] = value
    else:
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )
from_owner(owner, **data) classmethod

Create a new instance of the metadata class with the owner set.

Source code in docprompt/schema/pipeline/metadata.py
@classmethod
def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
    """Create a new instance of the metadata class with the owner set."""
    metadata = cls(**data)
    metadata.owner = owner
    return metadata
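
For instance, continuing the CustomMetadata example above (node stands in for any owner object, e.g. a DocumentNode):

metadata = CustomMetadata.from_owner(node, foo='bar', cow='moo')
assert metadata.owner is node
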
validate_data_fields_from_annotations(data) classmethod

Validate the data fields from the annotations.

Source code in docprompt/schema/pipeline/metadata.py
@model_validator(mode="before")
@classmethod
def validate_data_fields_from_annotations(cls, data: Any) -> Any:
    """Validate the data fields from the annotations."""

    # We want to make sure that we combine the `extra` metadata along with any
    # other specific fields that are defined in the metadata.
    extra = data.pop("extra", {})
    assert isinstance(extra, dict), "The `extra` field must be a dictionary."
    data = {**data, **extra}

    # If the model has been sub-classed, then all of our fields must be
    # validated by the pydantic model.
    if cls._is_field_typed():
        # We will get the fields out of extra and set them as potential fields to
        # validate. They will be ignored if they are not defined in the model, but it
        # allows for a more flexible way to define metadata.
        # Otherwise, whatever is in the `extra` field will be stored in the `extra` field.
        return {**data, "extra": extra}

    # Otherwise, we are using our mock-dict implementation, so we store our
    # metadata in the `extra` field.
    return {"extra": data}

node

DocumentCollection

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py
class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata
    """

    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    metadata: DocumentCollectionMetadata = Field(default_factory=dict)
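
A brief instantiation sketch, assuming node_a and node_b are existing DocumentNode instances:

collection = DocumentCollection(
    document_nodes=[node_a, node_b],
    metadata={'batch': 'invoices-2024'},
)

len(collection.document_nodes)  # 2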


base

BaseNode

Bases: BaseModel

The base node class is utilized for defining a basic yet flexible interface

Source code in docprompt/schema/pipeline/node/base.py
class BaseNode(BaseModel):
    """The base node class is utilized for defining a basic yet flexible interface"""

collection

DocumentCollection

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py
class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata
    """

    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)

document

DocumentNode

Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/document.py
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    """
    Represents a single document, with some metadata
    """

    document: PdfDocument
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    )
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",
        default_factory=BaseMetadata,
    )

    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        state = super().__getstate__()

        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        return len(self.page_nodes)

    def __getitem__(self, index):
        return self.page_nodes[index]

    def __iter__(self):
        return iter(self.page_nodes)

    @property
    def rasterizer(self):
        return DocumentRasterizer(self)

    @property
    def locator(self):
        if self._locator is None:
            self.refresh_locator()

        return self._locator

    def refresh_locator(self):
        """
        Refreshes the locator for this document node
        """
        from docprompt.provenance.search import DocumentProvenanceLocator

        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    @classmethod
    def from_document(
        cls,
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
    ):
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
            document=document,
        )
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}
        )

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."
            )

        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    document=document_node,
                    page_number=page_number,
                    metadata=page_metadata[page_number - 1],
                )
            else:
                page_node = PageNode(document=document_node, page_number=page_number)

            document_node.page_nodes.append(page_node)

        return document_node

    @property
    def file_hash(self):
        return self.document.document_hash

    @property
    def document_name(self):
        return self.document.name

    @classmethod
    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, we want to retrieve
        # it as a TypedDict
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
            )

        # Get the overriden Generic type of th DocumentNodeMetadata
        return metadata_field_annotation

    @classmethod
    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # NOTE: The indexing is important here, and relies on the generic type being
        # the SECOND of the two arguments in the `Union` annotation
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
            "metadata"
        ].annotation

        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."
            )

        return page_node_metadata_field_annotation

    @property
    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    @persistance_path.setter
    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    @classmethod
    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

        Args:
            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            DocumentNode: The loaded document node.
        """

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
            file_hash, **kwargs
        )

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
        else:
            metadata = cls.metadata_class().from_owner(node, **{})

        if page_metadata_bytes:
            page_metadata_json = [
                json.loads(page_str)
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            ]
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            ]
        else:
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            meta.set_owner(page)
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

        Args:
            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            FileSidecarsPathManager: The file path manager for the persisted document node.
        """

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
            encoding="utf-8",
        )

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
        )
persistance_path property writable

The base path to the storage location.

from_storage(path, file_hash, **kwargs) classmethod

Load the document node from storage.

Parameters:

    path (str, required): The base path to the storage location.
        - Example (S3): "s3://bucket-name/key/to/folder"
        - Example (Local FS): "/tmp/docprompt/storage"
    file_hash (str, required): The hash of the document.
    **kwargs (default {}): Additional keyword arguments for fsspec FileSystem.

Returns:

    DocumentNode (Self): The loaded document node.

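For reference, a minimal round-trip sketch (the local file and base path below are hypothetical, and the import locations are inferred from the "Source code in" paths shown in this reference):

from docprompt.schema.document import PdfDocument
from docprompt.schema.pipeline.node.document import DocumentNode

# Build and persist a node so there is something to load back
doc = PdfDocument.from_path("data/example.pdf")  # hypothetical file
node = DocumentNode.from_document(doc)
node.persist("/tmp/docprompt/storage")

# Later, reload the node from the base path plus the document hash
restored = DocumentNode.from_storage(
    "/tmp/docprompt/storage",
    doc.document_hash,
)
assert restored.document.document_hash == doc.document_hash
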
metadata_class() classmethod

Get the metadata class for instantiating metadata from the model.

page_metadata_class() classmethod

Get the metadata class for the page nodes in the document.

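Taken together, metadata_class() and page_metadata_class() resolve the concrete metadata models from the Generic parameters. A minimal sketch, assuming DocumentNode is parameterized as DocumentNode[DocumentNodeMetadata, PageNodeMetadata] and that BaseMetadata is importable as shown (the subclass names are illustrative):

from docprompt.schema.pipeline.metadata import BaseMetadata  # assumed import path
from docprompt.schema.pipeline.node.document import DocumentNode

class InvoiceMetadata(BaseMetadata):
    vendor: str = "unknown"

class PageQualityMetadata(BaseMetadata):
    blur_score: float = 0.0

class InvoiceNode(DocumentNode[InvoiceMetadata, PageQualityMetadata]):
    pass

# Without an override, the default BaseMetadata is returned
assert DocumentNode.metadata_class() is BaseMetadata

# With concrete parameters, the overriding models are returned
assert InvoiceNode.metadata_class() is InvoiceMetadata
assert InvoiceNode.page_metadata_class() is PageQualityMetadata
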
persist(path=None, **kwargs)

Persist a document node to storage.

Parameters:

    path (Optional[str], default None): Overwrites the current persistance_path property.
        If persistance_path is not currently set, path must be provided.
    **kwargs (default {}): Additional keyword arguments for fsspec FileSystem.

Returns:

    FileSidecarsPathManager: The file path manager for the persisted document node.

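A short persistence sketch (the base path is hypothetical). The first call must supply a path; it is then remembered on the node, so later calls can omit it:

# First persist: the base path is required and stored on the node
manager = node.persist("/tmp/docprompt/storage")

# Subsequent persists reuse node.persistance_path
manager = node.persist()

# With no argument and no stored persistance_path, persist() raises ValueError
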
refresh_locator()

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/document.py
def refresh_locator(self):
    """
    Refreshes the locator for this document node
    """
    from docprompt.provenance.search import DocumentProvenanceLocator

    if any(not page.ocr_results.result for page in self.page_nodes):
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"
        )

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator
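
A guarded usage sketch: since refresh_locator() raises ValueError when any page lacks OCR results, populate OCR first (provider setup omitted) and check before refreshing. The query string is illustrative, and calling search() without page_number assumes the document-wide form suggested by the per-page call in PageNode.search below:

# Every page must carry OCR results before a locator can be built
if all(page.ocr_results.result for page in node.page_nodes):
    locator = node.refresh_locator()
    hits = locator.search("total amount due")
    page_hits = locator.search("total amount due", page_number=1)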

page

PageNode

Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/page.py
class PageNode(BaseNode, Generic[PageNodeMetadata]):
    """
    Represents a single page in a document, with some metadata
    """

    document: "DocumentNode" = Field(exclude=True, repr=False)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        description="Application-specific metadata for the page",
        default_factory=BaseMetadata,
    )
    extra: Dict[str, Any] = Field(
        description="Extra data that can be stored on the page node",
        default_factory=dict,
    )

    ocr_results: ResultContainer[OcrPageResult] = Field(
        default_factory=_result_container_factory,
        description="The OCR results for the page",
        repr=False,
    )

    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        state = super().__getstate__()

        state["__pydantic_private__"]["_raster_cache"] = {}

        return state

    @property
    def rasterizer(self):
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True
    ):
        return self.document.locator.search(
            query,
            page_number=self.page_number,
            refine_to_word=refine_to_words,
            require_exact_match=require_exact_match,
        )
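
A brief per-page sketch (the query is illustrative; OCR results must already be populated, since search delegates to the owning document's locator):

page = node.page_nodes[0]

# Refine hits to word level and allow non-exact matches
results = page.search(
    "invoice number",
    refine_to_words=True,
    require_exact_match=False,
)

# Arbitrary application data can ride along on the page node
page.extra["reviewed"] = True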

typing

rasterizer

DocumentRasterizer

Source code in docprompt/schema/pipeline/rasterizer.py
class DocumentRasterizer:
    def __init__(self, owner: "DocumentNode"):
        self.owner = owner

    def rasterize(
        self,
        name: str,
        *,
        return_mode: Literal["bytes", "pil"] = "bytes",
        dpi: int = 100,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        max_file_size_bytes: Optional[int] = None,
        render_grayscale: bool = False,
    ) -> List[Union[bytes, Image.Image]]:
        images = self.owner.document.rasterize_pdf(
            dpi=dpi,
            downscale_size=downscale_size,
            resize_mode=resize_mode,
            resize_aspect_ratios=resize_aspect_ratios,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            max_file_size_bytes=max_file_size_bytes,
            render_grayscale=render_grayscale,
            return_mode=return_mode,
        )

        for page_number, image in images.items():
            page_node = self.owner.page_nodes[page_number - 1]

            page_node._raster_cache[name] = image

        return list(images.values())

    def propagate_cache(self, name: str, rasters: Dict[int, Union[bytes, Image.Image]]):
        """
        Should be one-indexed
        """
        for page_number, raster in rasters.items():
            page_node = self.owner.page_nodes[page_number - 1]

            page_node._raster_cache[name] = raster
propagate_cache(name, rasters)

Should be one-indexed

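A rasterization sketch. DocumentRasterizer is constructed with its owning node (a convenience accessor on DocumentNode may exist, but only the constructor is shown in this excerpt); the cache names and DPI are illustrative:

from docprompt.schema.pipeline.rasterizer import DocumentRasterizer

rasterizer = DocumentRasterizer(node)

# Rasterize every page at 100 DPI as grayscale bytes, cached under "default"
images = rasterizer.rasterize(
    "default",
    return_mode="bytes",
    dpi=100,
    do_convert=True,
    image_convert_mode="L",
)

# Seed a second cache entry per page from an external, one-indexed mapping
rasterizer.propagate_cache("thumbnails", {1: images[0]})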