Index

`document`

`PdfDocument`

Bases: BaseModel

Represents a PDF document

Source code in docprompt/schema/document.py

class PdfDocument(BaseModel):
    """
    Represents a PDF document

    Holds the raw PDF bytes together with a display name, an optional source
    path and an optional password, and exposes helpers for page counting,
    hashing, rasterization, splitting, compression and writing to disk.
    """

    name: str = Field(description="The name of the document")
    file_bytes: bytes = Field(description="The bytes of the document", repr=False)
    file_path: Optional[str] = None

    password: Optional[SecretStr] = None

    def __len__(self):
        """Return the number of pages in the document."""
        return self.num_pages

    def __hash__(self):
        """Hash the document by its content hash (see `document_hash`)."""
        return hash(self.document_hash)

    @computed_field
    @cached_property
    def page_count(self) -> PositiveInt:
        """Number of pages, computed from `file_bytes` once and then cached."""
        # Imported lazily, as in the original code (presumably to avoid an
        # import cycle with the utils package -- TODO confirm).
        from docprompt.utils.util import get_page_count

        return get_page_count(self.file_bytes)

    @property
    def num_pages(self):
        """Alias for `page_count`."""
        return self.page_count

    @property
    def bytes_per_page(self):
        """Average number of bytes per page (total size / page count)."""
        return len(self.file_bytes) / self.num_pages

    @computed_field
    @cached_property
    def document_hash(self) -> str:
        """Hash string derived from `file_bytes`, computed once and cached."""
        from docprompt.utils.util import hash_from_bytes

        return hash_from_bytes(self.file_bytes)

    @field_serializer("file_bytes")
    def serialize_file_bytes(self, v: bytes, _info):
        """Serialize the bytes as a base64 string of the gzip-compressed data."""
        compressed = gzip.compress(v)

        return base64.b64encode(compressed).decode("utf-8")

    @field_validator("file_bytes")
    @classmethod
    def validate_file_bytes(cls, v: bytes):
        """
        Validate (and, if needed, decode) the document bytes.

        Input detected as ``text/plain`` is base64-decoded, input detected as
        gzip is decompressed, and the final result must be detected as a PDF.

        Raises:
            ValueError: if the value is not bytes, is empty, or does not end
                up being a PDF after the decoding steps.
        """
        if not isinstance(v, bytes):
            raise ValueError("File bytes must be bytes")

        if len(v) == 0:
            raise ValueError("File bytes must not be empty")

        # These two steps undo `serialize_file_bytes` (base64 over gzip).
        if filetype.guess_mime(v) == "text/plain":
            v = base64.b64decode(v, validate=True)

        if filetype.guess_mime(v) == "application/gzip":
            v = gzip.decompress(v)

        if filetype.guess_mime(v) != "application/pdf":
            raise ValueError("File bytes must be a PDF")

        return v

    @classmethod
    def from_path(cls, file_path: Union[PathLike, str]):
        """
        Build a document from a file on disk.

        The document name is the file's base name and `file_path` records the
        path that was read.

        Raises:
            ValueError: if `file_path` does not point to a regular file.
        """
        file_path = Path(file_path)

        if not file_path.is_file():
            raise ValueError(f"File path {file_path} is not a file")

        file_bytes = file_path.read_bytes()

        return cls(name=file_path.name, file_path=str(file_path), file_bytes=file_bytes)

    @classmethod
    def from_bytes(cls, file_bytes: bytes, name: Optional[str] = None):
        """
        Build a document from in-memory bytes.

        When no name is given, a timestamp-based ``PDF-<isoformat>.pdf`` name
        is generated.
        """
        if name is None:
            name = f"PDF-{datetime.now().isoformat()}.pdf"

        return cls(name=name, file_bytes=file_bytes)

    def get_bytes(self) -> bytes:
        """Return the raw document bytes (deprecated accessor for `file_bytes`)."""
        return self.file_bytes  # Deprecated

    @property
    def path(self):
        """Alias for `file_path`."""
        return self.file_path

    def get_page_render_size(
        self, page_number: int, dpi: int = DEFAULT_DPI
    ) -> Tuple[int, int]:
        """
        Returns the render size of a page in pixels
        """
        return get_page_render_size_from_bytes(self.get_bytes(), page_number, dpi=dpi)

    def to_compressed_bytes(self, compression_kwargs: Optional[dict] = None) -> bytes:
        """
        Compresses the document using Ghostscript

        Args:
            compression_kwargs: extra keyword arguments forwarded to
                `compress_pdf_to_bytes`; ``None`` (the default) forwards none.

        Returns:
            The compressed document bytes.
        """
        # `None` instead of a shared `{}` default avoids the mutable-default pitfall.
        with self.as_tempfile() as temp_path:
            return compress_pdf_to_bytes(temp_path, **(compression_kwargs or {}))

    def rasterize_page(
        self,
        page_number: int,
        *,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        return_mode: Literal["pil", "bytes"] = "bytes",
    ):
        """
        Rasterizes a page of the document using Pdfium

        `page_number` is 1-based. Post-processing (`process_raster_image`) is
        only attached when at least one of the resize / convert / quantize /
        size-limit options is set.

        Raises:
            ValueError: if `page_number` is outside ``1..num_pages``.
        """
        if page_number <= 0 or page_number > self.num_pages:
            # Page numbers are 1-based, so the lower bound reported is 1.
            raise ValueError(f"Page number must be between 1 and {self.num_pages}")

        post_process_fn = None

        if any(
            (
                downscale_size,
                max_file_size_bytes,
                resize_aspect_ratios,
                do_convert,
                do_quantize,
            )
        ):
            post_process_fn = partial(
                process_raster_image,
                resize_width=downscale_size[0] if downscale_size else None,
                resize_height=downscale_size[1] if downscale_size else None,
                resize_mode=resize_mode,
                resize_aspect_ratios=resize_aspect_ratios,
                do_convert=do_convert,
                image_convert_mode=image_convert_mode,
                do_quantize=do_quantize,
                quantize_color_count=quantize_color_count,
                max_file_size_bytes=max_file_size_bytes,
            )

        rastered = rasterize_page_with_pdfium(
            self.file_bytes,
            page_number,
            return_mode=return_mode,
            post_process_fn=post_process_fn,
            # Scale is expressed relative to a 72 DPI baseline.
            scale=(1 / 72) * dpi,
        )

        return rastered

    def rasterize_page_to_data_uri(
        self,
        page_number: int,
        *,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        render_grayscale: bool = False,
    ) -> str:
        """
        Rasterizes a page of the document using Pdfium and returns a data URI, which can
        be embedded into HTML or passed to large language models

        NOTE(review): `render_grayscale` is accepted but not forwarded to
        `rasterize_page`, so it currently has no effect.
        """
        image_bytes = self.rasterize_page(
            page_number,
            dpi=dpi,
            downscale_size=downscale_size,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            resize_mode=resize_mode,
            max_file_size_bytes=max_file_size_bytes,
            resize_aspect_ratios=resize_aspect_ratios,
            return_mode="bytes",
        )
        return f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"

    def rasterize_pdf(
        self,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        return_mode: Literal["pil", "bytes"] = "bytes",
        render_grayscale: bool = False,
    ) -> Dict[int, bytes]:
        """
        Rasterizes the entire document using Pdfium

        Returns:
            A dict mapping 1-based page numbers to the rasterized page, in the
            order yielded by `rasterize_pdf_with_pdfium`.
        """
        result = {}

        post_process_fn = None

        if any(
            (
                downscale_size,
                max_file_size_bytes,
                resize_aspect_ratios,
                do_convert,
                do_quantize,
            )
        ):
            post_process_fn = partial(
                process_raster_image,
                resize_width=downscale_size[0] if downscale_size else None,
                resize_height=downscale_size[1] if downscale_size else None,
                resize_mode=resize_mode,
                resize_aspect_ratios=resize_aspect_ratios,
                do_convert=do_convert,
                image_convert_mode=image_convert_mode,
                do_quantize=do_quantize,
                quantize_color_count=quantize_color_count,
                max_file_size_bytes=max_file_size_bytes,
            )

        for idx, rastered in enumerate(
            rasterize_pdf_with_pdfium(
                self.file_bytes,
                scale=(1 / 72) * dpi,
                grayscale=render_grayscale,
                return_mode=return_mode,
                post_process_fn=post_process_fn,
            )
        ):
            # Keys are 1-based page numbers.
            result[idx + 1] = rastered

        return result

    def split(self, start: Optional[int] = None, stop: Optional[int] = None):
        """
        Extracts a page range of this document into a new document

        Args:
            start: first page passed to `split_pdf_to_bytes`; defaults to 0.
            stop: last page passed to `split_pdf_to_bytes`.

        Returns:
            A new document of the same class, with the same name, built from
            the extracted bytes.

        Raises:
            ValueError: if neither `start` nor `stop` is given.
        """
        if start is None and stop is None:
            raise ValueError("Must specify either start or stop")

        start = start or 0

        from docprompt.utils.splitter import split_pdf_to_bytes

        split_bytes = split_pdf_to_bytes(
            self.file_bytes, start_page=start, stop_page=stop
        )

        # Build the result from this instance's own class instead of the
        # free name `Document`, which this class does not define.
        return self.__class__.from_bytes(split_bytes, name=self.name)

    def as_tempfile(self, **kwargs):
        """
        Returns a tempfile of the document

        The return value is a context manager that yields the path of a named
        temporary file containing the document bytes. `kwargs` override the
        default `tempfile.NamedTemporaryFile` options
        (``mode="wb"``, ``delete=True``, ``suffix=".pdf"``).
        """

        @contextmanager
        def tempfile_context() -> Generator[str, None, None]:
            tempfile_kwargs = {"mode": "wb", "delete": True, "suffix": ".pdf", **kwargs}

            with tempfile.NamedTemporaryFile(**tempfile_kwargs) as f:
                f.write(self.file_bytes)
                # Flush so readers opening the path see the full content.
                f.flush()
                yield f.name

        return tempfile_context()

    def write_to_path(self, path: Union[PathLike, str], **kwargs):
        """
        Writes the document to a path

        If `path` is an existing directory the file is written inside it under
        the document's `name`. `kwargs` is accepted but currently unused.
        """
        path = Path(path)

        if path.is_dir():
            path = path / self.name

        with path.open("wb") as f:
            f.write(self.file_bytes)

`as_tempfile(**kwargs)`

Returns a tempfile of the document

Source code in docprompt/schema/document.py

def as_tempfile(self, **kwargs):
    """
    Returns a tempfile of the document

    The result is a context manager; entering it yields the path of a named
    temporary file holding the document bytes. Caller-supplied `kwargs`
    override the default `NamedTemporaryFile` options.
    """
    # Defaults first, caller overrides last.
    options = dict(mode="wb", delete=True, suffix=".pdf")
    options.update(kwargs)

    @contextmanager
    def _scratch_file() -> Generator[str, None, None]:
        with tempfile.NamedTemporaryFile(**options) as handle:
            handle.write(self.file_bytes)
            # Push buffered bytes to disk before handing out the path.
            handle.flush()
            yield handle.name

    return _scratch_file()

`get_page_render_size(page_number, dpi=DEFAULT_DPI)`

Returns the render size of a page in pixels

Source code in docprompt/schema/document.py

def get_page_render_size(
    self, page_number: int, dpi: int = DEFAULT_DPI
) -> Tuple[int, int]:
    """
    Returns the render size of a page in pixels

    Delegates to `get_page_render_size_from_bytes` using this document's bytes.
    """
    pdf_bytes = self.get_bytes()
    render_size = get_page_render_size_from_bytes(pdf_bytes, page_number, dpi=dpi)
    return render_size

`rasterize_page(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes')`

Rasterizes a page of the document using Pdfium

Source code in docprompt/schema/document.py

def rasterize_page(
    self,
    page_number: int,
    *,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    return_mode: Literal["pil", "bytes"] = "bytes",
):
    """
    Rasterizes a page of the document using Pdfium

    `page_number` is 1-based. Post-processing (`process_raster_image`) is
    only attached when at least one of the resize / convert / quantize /
    size-limit options is set.

    Raises:
        ValueError: if `page_number` is outside ``1..num_pages``.
    """
    if page_number <= 0 or page_number > self.num_pages:
        # Page numbers are 1-based, so the lower bound reported is 1.
        raise ValueError(f"Page number must be between 1 and {self.num_pages}")

    post_process_fn = None

    if any(
        (
            downscale_size,
            max_file_size_bytes,
            resize_aspect_ratios,
            do_convert,
            do_quantize,
        )
    ):
        post_process_fn = partial(
            process_raster_image,
            resize_width=downscale_size[0] if downscale_size else None,
            resize_height=downscale_size[1] if downscale_size else None,
            resize_mode=resize_mode,
            resize_aspect_ratios=resize_aspect_ratios,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            max_file_size_bytes=max_file_size_bytes,
        )

    rastered = rasterize_page_with_pdfium(
        self.file_bytes,
        page_number,
        return_mode=return_mode,
        post_process_fn=post_process_fn,
        # Scale is expressed relative to a 72 DPI baseline.
        scale=(1 / 72) * dpi,
    )

    return rastered

`rasterize_page_to_data_uri(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, render_grayscale=False)`

Rasterizes a page of the document using Pdfium and returns a data URI, which can be embedded into HTML or passed to large language models

Source code in docprompt/schema/document.py

def rasterize_page_to_data_uri(
    self,
    page_number: int,
    *,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    render_grayscale: bool = False,
) -> str:
    """
    Rasterizes one page via `rasterize_page` (always in "bytes" mode) and wraps
    the result in a base64 ``data:image/png`` URI, suitable for embedding in
    HTML or for passing to large language models.

    NOTE(review): `render_grayscale` is accepted but not forwarded to
    `rasterize_page`, so it currently has no effect.
    """
    png_bytes = self.rasterize_page(
        page_number,
        return_mode="bytes",
        dpi=dpi,
        downscale_size=downscale_size,
        resize_mode=resize_mode,
        resize_aspect_ratios=resize_aspect_ratios,
        max_file_size_bytes=max_file_size_bytes,
        do_convert=do_convert,
        image_convert_mode=image_convert_mode,
        do_quantize=do_quantize,
        quantize_color_count=quantize_color_count,
    )
    encoded = base64.b64encode(png_bytes).decode("utf-8")
    return f"data:image/png;base64,{encoded}"

`rasterize_pdf(dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes', render_grayscale=False)`

Rasterizes the entire document using Pdfium

Source code in docprompt/schema/document.py

def rasterize_pdf(
    self,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    return_mode: Literal["pil", "bytes"] = "bytes",
    render_grayscale: bool = False,
) -> Dict[int, bytes]:
    """
    Rasterizes every page of the document using Pdfium

    Returns a dict keyed by 1-based page number, in the order pages are
    yielded by `rasterize_pdf_with_pdfium`.
    """
    # Post-processing is only wired in when some option actually asks for it.
    wants_post_processing = any(
        (
            downscale_size,
            max_file_size_bytes,
            resize_aspect_ratios,
            do_convert,
            do_quantize,
        )
    )

    if wants_post_processing:
        post_process_fn = partial(
            process_raster_image,
            resize_width=downscale_size[0] if downscale_size else None,
            resize_height=downscale_size[1] if downscale_size else None,
            resize_mode=resize_mode,
            resize_aspect_ratios=resize_aspect_ratios,
            max_file_size_bytes=max_file_size_bytes,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
        )
    else:
        post_process_fn = None

    page_images = rasterize_pdf_with_pdfium(
        self.file_bytes,
        scale=(1 / 72) * dpi,
        grayscale=render_grayscale,
        return_mode=return_mode,
        post_process_fn=post_process_fn,
    )

    # Number the pages starting at 1.
    return dict(enumerate(page_images, start=1))

`split(start=None, stop=None)`

Splits a document into multiple documents

Source code in docprompt/schema/document.py

def split(self, start: Optional[int] = None, stop: Optional[int] = None):
    """
    Extracts a page range of this document into a new document

    Args:
        start: first page passed to `split_pdf_to_bytes`; defaults to 0.
        stop: last page passed to `split_pdf_to_bytes`.

    Returns:
        A new document of the same class, with the same name, built from
        the extracted bytes.

    Raises:
        ValueError: if neither `start` nor `stop` is given.
    """
    if start is None and stop is None:
        raise ValueError("Must specify either start or stop")

    start = start or 0

    from docprompt.utils.splitter import split_pdf_to_bytes

    split_bytes = split_pdf_to_bytes(
        self.file_bytes, start_page=start, stop_page=stop
    )

    # Build the result from this instance's own class instead of the free
    # name `Document`, which is not defined alongside this method.
    return self.__class__.from_bytes(split_bytes, name=self.name)

`to_compressed_bytes(compression_kwargs={})`

Compresses the document using Ghostscript

Source code in docprompt/schema/document.py

def to_compressed_bytes(self, compression_kwargs: Optional[dict] = None) -> bytes:
    """
    Compresses the document using Ghostscript

    Args:
        compression_kwargs: extra keyword arguments forwarded to
            `compress_pdf_to_bytes`; ``None`` (the default) forwards none.

    Returns:
        The compressed document bytes.
    """
    # `None` instead of a shared `{}` default avoids the mutable-default pitfall.
    with self.as_tempfile() as temp_path:
        return compress_pdf_to_bytes(temp_path, **(compression_kwargs or {}))

`write_to_path(path, **kwargs)`

Writes the document to a path

Source code in docprompt/schema/document.py

def write_to_path(self, path: Union[PathLike, str], **kwargs):
    """
    Writes the document bytes to `path`

    When `path` is an existing directory, the file is created inside it under
    the document's `name`. `kwargs` is accepted but not used.
    """
    candidate = Path(path)
    destination = candidate / self.name if candidate.is_dir() else candidate

    with open(destination, "wb") as sink:
        sink.write(self.file_bytes)

`get_page_render_size_from_bytes(file_bytes, page_number, dpi=DEFAULT_DPI)`

Returns the render size of a page in pixels

Source code in docprompt/schema/document.py

def get_page_render_size_from_bytes(
    file_bytes: bytes, page_number: int, dpi: int = DEFAULT_DPI
):
    """
    Returns the render size of a page in pixels

    The page's media box is read through Pdfium; its width and height are
    truncated to whole units and then scaled by ``dpi / 72``, truncating
    again. The result is a ``(width, height)`` tuple of ints.
    """

    def _to_pixels(whole_units: int) -> int:
        # Same arithmetic order as before: multiply first, then divide.
        return int(whole_units * dpi / 72)

    with get_pdfium_document(file_bytes) as pdf:
        box = pdf.get_page(page_number).get_mediabox()

        # Box extents are truncated before scaling.
        points_wide = int(box[2] - box[0])
        points_high = int(box[3] - box[1])

        return _to_pixels(points_wide), _to_pixels(points_high)

`layout`

`BoundingPoly`

Bases: BaseModel

Represents a normalized bounding poly with each value in the range [0, 1]

Used for higher order shapes like polygons on a page

Source code in docprompt/schema/layout.py

class BoundingPoly(BaseModel):
    """
    A polygon described by normalized vertices, each value in the range [0, 1].

    Intended for higher order shapes on a page that a plain box cannot describe.
    """

    normalized_vertices: List[Point]

    def __getitem__(self, index):
        """Index straight into the vertex list."""
        vertices = self.normalized_vertices
        return vertices[index]

`NormBBox`

Bases: BaseModel

Represents a normalized bounding box with each value in the range [0, 1]

Where x1 > x0 and bottom > top

Source code in docprompt/schema/layout.py

class NormBBox(BaseModel):
    """
    Represents a normalized bounding box with each value in the range [0, 1]

    Where x1 > x0 and bottom > top (the ordering is documented here but not
    checked by this class itself).
    """

    x0: BoundedFloat
    top: BoundedFloat
    x1: BoundedFloat
    bottom: BoundedFloat

    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    def as_tuple(self):
        """Return the box as ``(x0, top, x1, bottom)``."""
        return (self.x0, self.top, self.x1, self.bottom)

    def __getitem__(self, index):
        """
        Return the coordinate at position `index` of ``(x0, top, x1, bottom)``.

        Negative indices count from the end, as with a tuple.

        Raises:
            IndexError: if `index` is outside ``-4..3``.
        """
        # Lots of if statements to prevent new allocations
        if index < 0:
            # Previously negative indices fell through and returned None.
            index += 4

        if index == 0:
            return self.x0
        elif index == 1:
            return self.top
        elif index == 2:
            return self.x1
        elif index == 3:
            return self.bottom

        raise IndexError("Index out of range")

    def __eq__(self, other):
        """Two boxes are equal when all four coordinates are equal."""
        if not isinstance(other, NormBBox):
            return False

        return self.as_tuple() == other.as_tuple()

    def __hash__(self):
        """Hash consistent with `__eq__` (based on the coordinate tuple)."""
        return hash(self.as_tuple())

    def __and__(self, other):
        """
        Return the intersection of two boxes, or ``None`` if they are disjoint.

        Boxes that merely touch yield a zero-width or zero-height box.

        Raises:
            TypeError: if `other` is not a NormBBox.
        """
        if not isinstance(other, NormBBox):
            raise TypeError("Can only compute intersection with NormBBox")
        # Compute the intersection of two bounding boxes
        new_x0 = max(self.x0, other.x0)
        new_top = max(self.top, other.top)
        new_x1 = min(self.x1, other.x1)
        new_bottom = min(self.bottom, other.bottom)

        # Check if there is an actual intersection and if the resulting bounding box is valid
        if new_x0 <= new_x1 and new_top <= new_bottom:
            return NormBBox(x0=new_x0, top=new_top, x1=new_x1, bottom=new_bottom)
        else:
            # Return an empty or non-existent bounding box representation
            return None

    def __add__(self, other):
        """
        Return the smallest box that covers both boxes.

        Raises:
            TypeError: if `other` is not a NormBBox.
        """
        if not isinstance(other, NormBBox):
            raise TypeError("Can only add NormBBox to NormBBox")

        return NormBBox(
            x0=min(self.x0, other.x0),
            top=min(self.top, other.top),
            x1=max(self.x1, other.x1),
            bottom=max(self.bottom, other.bottom),
        )

    def __contains__(self, other):
        """Return True when `other` lies entirely within this box (edges inclusive)."""
        return (
            self.x0 <= other.x0
            and self.top <= other.top
            and self.x1 >= other.x1
            and self.bottom >= other.bottom
        )

    def intersection_over_union(self, other):
        """
        Return intersection area divided by union area, or 0 when undefined.

        0 is returned when the boxes do not intersect, and also when the
        union has zero area (both boxes degenerate).

        Raises:
            TypeError: if `other` is not a NormBBox.
        """
        if not isinstance(other, NormBBox):
            raise TypeError("Can only compute IOU with NormBBox")

        # Compute the intersection
        intersection_bbox = self & other

        if intersection_bbox is None:
            return 0  # No intersection

        intersection_area = intersection_bbox.area
        union_area = self.area + other.area - intersection_area

        if union_area == 0:
            # Two zero-area boxes that touch: avoid dividing by zero.
            return 0

        return intersection_area / union_area

    def x_overlap(self, other):
        """
        Get the overlap, between 0 and 1, of the x-axis of two bounding boxes
        """
        return max(0, min(self.x1, other.x1) - max(self.x0, other.x0))

    def y_overlap(self, other):
        """
        Get the overlap, between 0 and 1, of the y-axis of two bounding boxes
        """
        return max(0, min(self.bottom, other.bottom) - max(self.top, other.top))

    @classmethod
    def combine(cls, *bboxes: "NormBBox"):
        """
        Combines multiple bounding boxes into a single bounding box

        Raises:
            ValueError: if no bounding boxes are given.
        """
        if len(bboxes) == 0:
            raise ValueError("Must provide at least one bounding box")

        if len(bboxes) == 1:
            return bboxes[0]

        working_bbox = bboxes[0]
        for bbox in bboxes[1:]:
            working_bbox = working_bbox + bbox

        return working_bbox

    @classmethod
    def from_bounding_poly(cls, bounding_poly: "BoundingPoly"):
        """
        Returns a NormBBox from a BoundingPoly

        The vertices are read in the order top-left, top-right, bottom-right,
        bottom-left; only the top-left and bottom-right ones are used.

        Raises:
            ValueError: if the polygon does not have exactly 4 vertices.
        """
        if len(bounding_poly.normalized_vertices) != 4:
            raise ValueError(
                "BoundingPoly must have 4 vertices for NormBBox conversion"
            )

        (
            top_left,
            top_right,
            bottom_right,
            bottom_left,
        ) = bounding_poly.normalized_vertices

        return cls(
            x0=top_left.x,
            top=top_left.y,
            x1=bottom_right.x,
            bottom=bottom_right.y,
        )

    @property
    def width(self):
        """Horizontal extent (``x1 - x0``)."""
        return self.x1 - self.x0

    @property
    def height(self):
        """Vertical extent (``bottom - top``)."""
        return self.bottom - self.top

    @property
    def area(self):
        """Width times height."""
        return self.width * self.height

    @property
    def centroid(self):
        """Center point as an ``(x, y)`` tuple."""
        return (self.x0 + self.x1) / 2, (self.top + self.bottom) / 2

    @property
    def y_center(self):
        """Vertical midpoint."""
        return (self.top + self.bottom) / 2

    @property
    def x_center(self):
        """Horizontal midpoint."""
        return (self.x0 + self.x1) / 2

`combine(*bboxes)` `classmethod`

Combines multiple bounding boxes into a single bounding box

Source code in docprompt/schema/layout.py

@classmethod
def combine(cls, *bboxes: "NormBBox"):
    """
    Merge any number of bounding boxes into one by adding them left to right.

    A single box is returned as-is; passing no boxes raises ValueError.
    """
    if not bboxes:
        raise ValueError("Must provide at least one bounding box")

    first, *remaining = bboxes

    merged = first
    for candidate in remaining:
        merged = merged + candidate

    return merged

`from_bounding_poly(bounding_poly)` `classmethod`

Returns a NormBBox from a BoundingPoly

Source code in docprompt/schema/layout.py

@classmethod
def from_bounding_poly(cls, bounding_poly: "BoundingPoly"):
    """
    Build a NormBBox from a four-vertex BoundingPoly.

    Vertices are taken in the order top-left, top-right, bottom-right,
    bottom-left; the box spans from the first to the third vertex.
    """
    vertex_count = len(bounding_poly.normalized_vertices)
    if vertex_count != 4:
        raise ValueError(
            "BoundingPoly must have 4 vertices for NormBBox conversion"
        )

    # Only the two opposite corners are needed; the other two are ignored.
    upper_left, _upper_right, lower_right, _lower_left = (
        bounding_poly.normalized_vertices
    )

    corners = {
        "x0": upper_left.x,
        "top": upper_left.y,
        "x1": lower_right.x,
        "bottom": lower_right.y,
    }
    return cls(**corners)

`x_overlap(other)`

Get the overlap, between 0 and 1, of the x-axis of two bounding boxes

Source code in docprompt/schema/layout.py

def x_overlap(self, other):
    """
    Length of the shared horizontal span of two bounding boxes (0 when the
    boxes do not overlap on the x-axis).
    """
    shared_right = min(self.x1, other.x1)
    shared_left = max(self.x0, other.x0)
    span = shared_right - shared_left
    return span if span > 0 else 0

`y_overlap(other)`

Get the overlap, between 0 and 1, of the y-axis of two bounding boxes

Source code in docprompt/schema/layout.py

def y_overlap(self, other):
    """
    Length of the shared vertical span of two bounding boxes (0 when the
    boxes do not overlap on the y-axis).
    """
    shared_bottom = min(self.bottom, other.bottom)
    shared_top = max(self.top, other.top)
    span = shared_bottom - shared_top
    return span if span > 0 else 0

`Point`

Bases: BaseModel

Represents a normalized point with each coordinate in the range [0, 1]

Source code in docprompt/schema/layout.py

class Point(BaseModel):
    """
    Represents a normalized point with each coordinate in the range [0, 1]
    """

    # Floats are rounded to 5 decimal places by the configured JSON encoder.
    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    # Horizontal and vertical coordinates of the point.
    x: BoundedFloat
    y: BoundedFloat

`TextBlock`

Bases: BaseModel

Represents a single block of text, with its bounding box. The bounding box is a tuple of (x0, top, x1, bottom) and is normalized to the page size.

Source code in docprompt/schema/layout.py

class TextBlock(BaseModel):
    """
    Represents a single block of text, with its bounding box.
    The bounding box is a tuple of (x0, top, x1, bottom) and
    is normalized to the page size.
    """

    # Floats are rounded to 5 decimal places by the configured JSON encoder.
    model_config: ConfigDict = {"json_encoders": {float: lambda v: round(v, 5)}}

    text: str
    type: SegmentLevels
    source: TextblockSource = Field(
        default="derived", description="The source of the text block"
    )

    # Layout information
    # Declared Optional because the default is None; otherwise a block dumped
    # without a box could not be validated again.
    bounding_box: Optional[NormBBox] = Field(default=None, repr=False)
    bounding_poly: Optional[BoundingPoly] = Field(default=None, repr=False)
    text_spans: Optional[List[TextSpan]] = Field(default=None, repr=False)

    metadata: Optional[TextBlockMetadata] = Field(default_factory=TextBlockMetadata)

    def __getitem__(self, index):
        """Allow dict-style access to attributes, e.g. ``block["text"]``."""
        return getattr(self, index)

    def __hash__(self):
        """Hash by text and bounding box coordinates (None when there is no box)."""
        bbox = self.bounding_box
        return hash((self.text, bbox.as_tuple() if bbox is not None else None))

    @property
    def confidence(self):
        """Confidence taken from the metadata, or None when there is no metadata."""
        return self.metadata.confidence if self.metadata is not None else None

    @property
    def direction(self):
        """Direction taken from the metadata, or None when there is no metadata."""
        return self.metadata.direction if self.metadata is not None else None

`pipeline`

`BaseMetadata`

Bases: BaseModel, MutableMapping, Generic[TMetadataOwner]

The base metadata class is utilized for defining a basic yet flexible interface for metadata attached to various fields.

The metadata class can be used in two ways

As a dictionary-like object, where metadata is stored in the extra field.
As a sub-classed model, where metadata is stored in the fields of the model.

When used out of the box, the metadata class will adopt dictionary-like behavior. You may easily access different fields of the metadata as if it were a dictionary:

# Instantiate it with any kwargs you like
metadata = BaseMetadata(foo='bar', cow='moo')

metadata["foo"]  # "bar"
metadata["cow"]  # "moo"

# Update the value of the key
metadata["foo"] = "fighters"

# Set new key-value pairs
metadata['sheep'] = 'baa'

Otherwise, you may sub-class the metadata class in order to create a more strictly typed metadata model. This is useful when you want to enforce a specific structure for your metadata.

class CustomMetadata(BaseMetadata):
    foo: str
    cow: str

# Instantiate it with the required fields
metadata = CustomMetadata(foo='bar', cow='moo')

metadata.foo  # "bar"
metadata.cow  # "moo"

# Update the value of the key
metadata.foo = "fighters"

# Use the extra field to store dynamic metadata
metadata.extra['sheep'] = 'baa'

Additionally, the task results descriptor allows for controlled and easy access to the task results of various tasks that are run on the parent node.

Source code in docprompt/schema/pipeline/metadata.py

class BaseMetadata(BaseModel, MutableMapping, Generic[TMetadataOwner]):
    """
    The base metadata class is utilized for defining a basic yet flexible interface
    for metadata attached to various fields.

    The metadata class can be used in two ways:
        1. As a dictionary-like object, where metadata is stored in the `extra` field.
        2. As a sub-classed model, where metadata is stored in the fields of the model.

    When used out of the box, the metadata class will adopt dictionary-like behavior. You
    may easily access different fields of the metadata as if it were a dictionary:
    ```python
    # Instantiate it with any kwargs you like
    metadata = BaseMetadata(foo='bar', cow='moo')

    metadata["foo"]  # "bar"
    metadata["cow"]  # "moo"

    # Update the value of the key
    metadata["foo"] = "fighters"

    # Set new key-value pairs
    metadata['sheep'] = 'baa'
    ```

    Otherwise, you may sub-class the metadata class in order to create a more strictly typed
    metadata model. This is useful when you want to enforce a specific structure for your metadata.

    ```python
    class CustomMetadata(BaseMetadata):
        foo: str
        cow: str

    # Instantiate it with the required fields
    metadata = CustomMetadata(foo='bar', cow='moo')

    metadata.foo  # "bar"
    metadata.cow  # "moo"

    # Update the value of the key
    metadata.foo = "fighters"

    # Use the extra field to store dynamic metadata
    metadata.extra['sheep'] = 'baa'
    ```

    Additionally, the task results descriptor allows for controlled and easy access to the task results
    of various tasks that are run on the parent node.
    """

    # Free-form storage backing the dictionary-like behavior.
    extra: Dict[str, Any] = Field(..., default_factory=dict, repr=False)

    # Descriptor instance that the `task_results` property delegates to.
    _task_results: TaskResultsDescriptor = PrivateAttr(
        default_factory=TaskResultsDescriptor
    )

    # The object this metadata is attached to; set through `owner` / `from_owner`.
    _owner: TMetadataOwner = PrivateAttr()

    @property
    def task_results(self) -> TaskResultsDescriptor:
        """Return the task results descriptor."""
        return self._task_results.__get__(self)

    @task_results.setter
    def task_results(self, value: Any) -> None:
        """This will raise an error, as we do not want to set the task results directly.

        NOTE: This implementation is here purely to avoid the task_results property from being
        overwritten by accident.
        """
        self._task_results.__set__(self, value)

    @property
    def owner(self) -> TMetadataOwner:
        """Return the owner of the metadata."""
        return self._owner

    @owner.setter
    def owner(self, owner: TMetadataOwner) -> None:
        """Set the owner of the metadata."""
        self._owner = owner

    @classmethod
    def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
        """Create a new instance of the metadata class with the owner set."""
        metadata = cls(**data)
        metadata.owner = owner
        return metadata

    @model_validator(mode="before")
    @classmethod
    def validate_data_fields_from_annotations(cls, data: Any) -> Any:
        """
        Normalize the raw input before field validation.

        The contents of an `extra` entry are merged with the other keys. For a
        sub-classed (field typed) model the merged keys are returned alongside the
        original `extra`; for the base model everything is stored under `extra`.

        Raises:
            ValueError: If the provided `extra` value is not a dictionary.
        """
        # Work on a shallow copy so the caller's mapping is never mutated by `pop`.
        data = dict(data)

        # We want to make sure that we combine the `extra` metadata along with any
        # other specific fields that are defined in the metadata.
        extra = data.pop("extra", {})
        # An explicit raise (rather than `assert`) keeps the check active under `-O`.
        if not isinstance(extra, dict):
            raise ValueError("The `extra` field must be a dictionary.")
        data = {**data, **extra}

        # If the model has been sub-classed, then all of our fields must be
        # validated by the pydantic model.
        if cls._is_field_typed():
            # The keys found in `extra` are offered as potential fields to validate,
            # while the `extra` dictionary itself is passed through unchanged.
            return {**data, "extra": extra}

        # Otherwise, we are using our mock-dict implementation, so we store our
        # metadata in the `extra` field.
        return {"extra": data}

    @classmethod
    def _is_field_typed(cls):
        """
        Check if the metadata model is field typed.

        This is used to determine if the metadata model is a dictionary-like model,
        or a more strictly typed model. The model counts as field typed as soon as
        its declared fields are anything other than exactly `extra`.
        """
        return set(cls.model_fields) != {"extra"}

    def __repr__(self):
        """
        Provide a string representation of the metadata.

        Sub-classed models use the inherited representation; the base metadata
        model is rendered as the JSON dump of its `extra` dictionary.
        """
        if self._is_field_typed():
            return super().__repr__()

        # Otherwise, we are dealing with dictionary-like metadata
        return json.dumps(self.extra)

    def __getitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, an
        AttributeError is raised.
        """
        if not self._is_field_typed():
            return self.extra[name]

        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __setitem__(self, name, value):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, an
        AttributeError is raised.
        """
        if not self._is_field_typed():
            self.extra[name] = value
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __delitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. If sub-classed, an
        AttributeError is raised.
        """
        if not self._is_field_typed():
            del self.extra[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __iter__(self):
        """
        Iterate over the keys in the metadata.

        This only works for the base metadata model. If sub-classed, an
        AttributeError is raised.
        """
        if self._is_field_typed():
            raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

        return iter(self.extra)

    def __len__(self):
        """
        Get the number of keys in the metadata.

        This only works for the base metadata model. If sub-classed, an
        AttributeError is raised.
        """
        if self._is_field_typed():
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '__len__'"
            )

        return len(self.extra)

    def __getattr__(self, name):
        """
        Allow for getting of attributes on the metadata class.

        The attributes are retrieved through the following hierarchy:
            - If the model is sub-classed, it will be retrieved as normal.
            - Otherwise, if the attribute is private, it will be retrieved as normal.
            - Finally, if we are getting a public attribute on the base metadata class,
                we use the extra field.
            - If the key is not set in the `extra` dictionary, we resort back to just
            trying to get the field.
        """
        if self._is_field_typed():
            return super().__getattr__(name)

        if name.startswith("_"):
            return super().__getattr__(name)

        # Index (rather than `.get`) so that a missing key actually reaches the
        # fallback below instead of silently yielding `None`.
        try:
            return self.extra[name]

        except KeyError:
            # Defer to the inherited attribute lookup for anything not in `extra`
            return super().__getattr__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Allow for setting of attributes on the metadata class.

        The attributes are set through the following hierarchy:
            - If the model is sub-classed, it will be set as normal.
            - Otherwise, if the attribute is private, it will be set as normal.
            - `owner` and `task_results` are also set as normal.
            - Finally, if we are setting a public attribute on the base metadata class,
                we use the extra field.
        """
        if self._is_field_typed():
            return super().__setattr__(name, value)

        # We want to avoid setting any private attributes in the extra
        # dictionary
        if name.startswith("_"):
            return super().__setattr__(name, value)

        # If it is `owner` or `task_results`, we want
        # to avoid setting the attribute in the `extra` dictionary
        if name in ["owner", "task_results"]:
            return super().__setattr__(name, value)

        self.extra[name] = value

    def __delattr__(self, name: str) -> None:
        """
        Ensure that we can delete attributes from the metadata class.

        The attributes are deleted through the following hierarchy:
            - If the attribute is `task_results`, we use the descriptor to delete the task results.
            - Otherwise, if it is a sub-classed model, it will be deleted as normal.
            - Finally, if we are deleting a public attribute on the base metadata class,
                we use the extra field.
        """

        # We want to use the descriptor to delete the task results
        if name == "task_results":
            self._task_results.__delete__(self)
            return

        # Otherwise, we use our standard fallback tiers
        if self._is_field_typed():
            return super().__delattr__(name)

        del self.extra[name]

`owner: TMetadataOwner` `property` `writable`

Return the owner of the metadata.

`task_results: TaskResultsDescriptor` `property` `writable`

Return the task results descriptor.

`delattr(name)`

Ensure that we can delete attributes from the metadata class.

The attributes are deleted through the following hierarchy:

If the attribute is task_results, we use the descriptor to delete the task results.
Otherwise, if it is a sub-classed model, it will be deleted as normal.
Finally, if we are deleting a public attribute on the base metadata class, we use the extra field.

Source code in docprompt/schema/pipeline/metadata.py

def __delattr__(self, name: str) -> None:
    """
    Delete an attribute from the metadata object.

    Resolution order:
        - `task_results` is removed through the task results descriptor.
        - A sub-classed (field typed) model defers to the inherited deletion logic.
        - Anything else is removed from the `extra` dictionary.
    """
    targets_task_results = name == "task_results"

    if targets_task_results:
        # Route the deletion through the descriptor held in `_task_results`
        self._task_results.__delete__(self)
    elif self._is_field_typed():
        return super().__delattr__(name)
    else:
        del self.extra[name]

`delitem(name)`

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an delitem method.

Source code in docprompt/schema/pipeline/metadata.py

def __delitem__(self, name):
    """
    Remove the key `name` from the `extra` dictionary.

    Only the base (dictionary-like) metadata model supports this; a sub-classed
    model raises an AttributeError instead.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    del self.extra[name]

`getattr(name)`

Allow for getting of attributes on the metadata class.

The attributes are retrieved through the following hierarchy:

If the model is sub-classed, it will be retrieved as normal.
Otherwise, if the attribute is private, it will be retrieved as normal.
Finally, if we are getting a public attribute on the base metadata class, we use the extra field.
If the key is not set in the extra dictionary, we resort back to just trying to get the field.
- This is when we grab the owner or task_results attribute.

Source code in docprompt/schema/pipeline/metadata.py

def __getattr__(self, name):
    """
    Allow for getting of attributes on the metadata class.

    The attributes are retrieved through the following heirarchy:
        - If the model is sub-classed, it will be retrieved as normal.
        - Otherwise, if the attribute is private, it will be retrieved as normal.
        - Finally, if we are getting a public attribute on the base metadata class,
            we use the extra field.
        - If the key is not set in the `extra` dictionary, we resort back to just
        trying to get the field.
            - This is when we grab the `owner` or `task_result` attribuite.
    """
    if self._is_field_typed():
        return super().__getattr__(name)

    if name.startswith("_"):
        return super().__getattr__(name)

    # Attempt to retreieve the attr from the `extra` field
    try:
        return self.extra.get(name)

    except KeyError:
        # This is for grabbing properties on the base metadata class
        return super().__getattr__(name)

`getitem(name)`

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an getitem method.

Source code in docprompt/schema/pipeline/metadata.py

def __getitem__(self, name):
    """
    Look up the key `name` in the `extra` dictionary.

    Only the base (dictionary-like) metadata model supports this; a sub-classed
    model raises an AttributeError instead.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    return self.extra[name]

`iter()`

Iterate over the keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an iter method.

Source code in docprompt/schema/pipeline/metadata.py

def __iter__(self):
    """
    Return an iterator over the keys of the `extra` dictionary.

    Only the base (dictionary-like) metadata model supports this; a sub-classed
    model raises an AttributeError instead.
    """
    if not self._is_field_typed():
        return iter(self.extra)

    raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

`len()`

Get the number of keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a len method.

Source code in docprompt/schema/pipeline/metadata.py

def __len__(self):
    """
    Return how many keys are stored in the `extra` dictionary.

    Only the base (dictionary-like) metadata model supports this; a sub-classed
    model raises an AttributeError instead.
    """
    if not self._is_field_typed():
        return len(self.extra)

    raise AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '__len__'"
    )

`repr()`

Provide a string representation of the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have a repr method.

Source code in docprompt/schema/pipeline/metadata.py

def __repr__(self):
    """
    Build the string representation of the metadata.

    The base (dictionary-like) model is rendered as the JSON dump of its `extra`
    dictionary; a sub-classed model uses the inherited representation.
    """
    if not self._is_field_typed():
        # Dictionary-like metadata: show the stored key-value pairs
        return json.dumps(self.extra)

    return super().__repr__()

`setattr(name, value)`

Allow for setting of attributes on the metadata class.

The attributes are set through the following hierarchy:

If the model is sub-classed, it will be set as normal.
Otherwise, if the attribute is private, it will be set as normal.
Finally, if we are setting a public attribute on the base metadata class, we use the extra field.

Source code in docprompt/schema/pipeline/metadata.py

def __setattr__(self, name: str, value: Any) -> None:
    """
    Assign an attribute on the metadata object.

    The inherited assignment logic is used when any of these hold:
        - the model is sub-classed (field typed),
        - the attribute name is private (starts with an underscore),
        - the attribute is `owner` or `task_results`.

    Every other name is stored as a key in the `extra` dictionary.
    """
    # Evaluated left to right, so the checks run in the same order as listed above
    handled_normally = (
        self._is_field_typed()
        or name.startswith("_")
        or name in ["owner", "task_results"]
    )

    if handled_normally:
        return super().__setattr__(name, value)

    self.extra[name] = value

`setitem(name, value)`

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModel's do not have an setitem method.

Source code in docprompt/schema/pipeline/metadata.py

def __setitem__(self, name, value):
    """
    Store `value` under the key `name` in the `extra` dictionary.

    Only the base (dictionary-like) metadata model supports this; a sub-classed
    model raises an AttributeError instead.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    self.extra[name] = value

`from_owner(owner, **data)` `classmethod`

Create a new instance of the metadata class with the owner set.

Source code in docprompt/schema/pipeline/metadata.py

@classmethod
def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
    """Build a metadata instance from `data` and attach `owner` to it before returning it."""
    instance = cls(**data)
    instance.owner = owner
    return instance

`validate_data_fields_from_annotations(data)` `classmethod`

Validate the data fields from the annotations.

Source code in docprompt/schema/pipeline/metadata.py

@model_validator(mode="before")
@classmethod
def validate_data_fields_from_annotations(cls, data: Any) -> Any:
    """
    Normalize the raw input before field validation.

    The contents of an `extra` entry are merged with the other keys. For a
    sub-classed (field typed) model the merged keys are returned alongside the
    original `extra`; for the base model everything is stored under `extra`.

    Raises:
        ValueError: If the provided `extra` value is not a dictionary.
    """
    # Work on a shallow copy so the caller's mapping is never mutated by `pop`.
    data = dict(data)

    # We want to make sure that we combine the `extra` metadata along with any
    # other specific fields that are defined in the metadata.
    extra = data.pop("extra", {})
    # An explicit raise (rather than `assert`) keeps the check active under `-O`.
    if not isinstance(extra, dict):
        raise ValueError("The `extra` field must be a dictionary.")
    data = {**data, **extra}

    # If the model has been sub-classed, then all of our fields must be
    # validated by the pydantic model.
    if cls._is_field_typed():
        # The keys found in `extra` are offered as potential fields to validate,
        # while the `extra` dictionary itself is passed through unchanged.
        return {**data, "extra": extra}

    # Otherwise, we are using our mock-dict implementation, so we store our
    # metadata in the `extra` field.
    return {"extra": data}

`DocumentCollection`

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py

class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata

    The class is generic over the collection-level metadata type as well as the
    document-node and page-node metadata types used by its document nodes.
    """

    # The document nodes that make up this collection.
    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    # Collection-level metadata; a new empty dict is created when none is given.
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)

`DocumentNode`

Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/document.py

class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    """
    Represents a single document, with some metadata

    The node wraps a `PdfDocument`, holds one `PageNode` per page, and carries a
    metadata object. It behaves like a sequence of its page nodes (`len`,
    indexing and iteration all operate on `page_nodes`).
    """

    document: PdfDocument
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    )
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",
        default_factory=BaseMetadata,
    )

    # Lazily created provenance locator; see the `locator` property.
    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    # Base storage path used by `persist` / `from_storage`.
    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        """Return the inherited state with the `_locator` private attribute reset to None."""
        state = super().__getstate__()

        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        """Return the number of page nodes."""
        return len(self.page_nodes)

    def __getitem__(self, index):
        """Return the page node(s) at `index` of `page_nodes`."""
        return self.page_nodes[index]

    def __iter__(self):
        """Iterate over the page nodes."""
        return iter(self.page_nodes)

    @property
    def rasterizer(self):
        """Return a new `DocumentRasterizer` bound to this node."""
        return DocumentRasterizer(self)

    @property
    def locator(self):
        """Return the provenance locator, creating it on first access."""
        if self._locator is None:
            self.refresh_locator()

        return self._locator

    def refresh_locator(self):
        """
        Refreshes the locator for this document node

        Raises:
            ValueError: If any page node has a falsy `ocr_results.result`.
        """
        from docprompt.provenance.search import DocumentProvenanceLocator

        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    @classmethod
    def from_document(
        cls,
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
    ):
        """
        Build a document node, with one page node per page, from a document.

        Args:
            document: The document to wrap.
            document_metadata: Metadata for the node. When None, a fresh instance of
                `metadata_class()` owned by the new node is created.
            page_metadata: Optional per-page metadata, one item per page in page order.

        Raises:
            ValueError: If `page_metadata` is given and its length differs from the
                number of pages in the document.
        """
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
            document=document,
        )

        # Compare against None explicitly: metadata objects are mappings, so a
        # truthiness test would discard an empty one and would call `__len__`,
        # which raises on sub-classed (field typed) metadata.
        if document_metadata is None:
            document_metadata = cls.metadata_class().from_owner(document_node)
        document_node.metadata = document_metadata

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."
            )

        # Page numbers are 1-based, while `page_metadata` is indexed from 0.
        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    document=document_node,
                    page_number=page_number,
                    metadata=page_metadata[page_number - 1],
                )
            else:
                page_node = PageNode(document=document_node, page_number=page_number)

            document_node.page_nodes.append(page_node)

        return document_node

    @property
    def file_hash(self):
        """Return the hash of the underlying document."""
        return self.document.document_hash

    @property
    def document_name(self):
        """Return the name of the underlying document."""
        return self.document.name

    @classmethod
    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model.

        Raises:
            ValueError: If the `metadata` field annotation is a ForwardRef.
        """

        fields = cls.model_fields

        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided for the generic metadata type, fall
        # back to the base metadata class
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
            )

        # Get the overridden Generic type of the DocumentNodeMetadata
        return metadata_field_annotation

    @classmethod
    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document.

        Raises:
            ValueError: If the page node `metadata` field annotation is a ForwardRef.
        """
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
            "metadata"
        ].annotation

        # No override of the generic page metadata type: use the base metadata class
        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."
            )

        return page_node_metadata_field_annotation

    @property
    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    @persistance_path.setter
    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    @classmethod
    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

        Args:
            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            DocumentNode: The loaded document node.
        """

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
            file_hash, **kwargs
        )

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
        else:
            metadata = cls.metadata_class().from_owner(node, **{})

        if page_metadata_bytes:
            # The stored payload is a JSON list whose items are themselves JSON
            # strings (see `persist`), hence the two rounds of decoding.
            page_metadata_json = [
                json.loads(page_str)
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            ]
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            ]
        else:
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            # The metadata class exposes ownership through its `owner` property
            # setter; it has no `set_owner` method.
            meta.owner = page
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

        Args:
            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            FileSidecarsPathManager: The file path manager for the persisted document node.

        Raises:
            ValueError: If no path is given and `persistance_path` is not set.
        """

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        # Each page's metadata is dumped to a JSON string, and the list of those
        # strings is JSON-encoded again.
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
            encoding="utf-8",
        )

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
        )

`persistance_path` `property` `writable`

The base path to storage location.

`from_storage(path, file_hash, **kwargs)` `classmethod`

Load the document node from storage.

Parameters:

Name	Type	Description	Default
`path`	`str`	The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"	required
`file_hash`	`str`	The hash of the document.	required
`**kwargs`		Additional keyword arguments for fsspec FileSystem	`{}`

Returns:

Name	Type	Description
`DocumentNode`	`Self`	The loaded document node.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Load the document node from storage.

    Args:
        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        DocumentNode: The loaded document node.
    """

    fs_manager = FileSystemManager(path, **kwargs)

    pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
        file_hash, **kwargs
    )

    doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
    node = cls.from_document(doc)

    if metadata_bytes:
        metadata_json = json.loads(metadata_bytes.decode("utf-8"))
        metadata = cls.metadata_class().from_owner(node, **metadata_json)
    else:
        metadata = cls.metadata_class().from_owner(node, **{})

    if page_metadata_bytes:
        # The stored payload is a JSON list whose items are themselves JSON
        # strings, hence the two rounds of decoding.
        page_metadata_json = [
            json.loads(page_str)
            for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
        ]
        page_metadata = [
            cls.page_metadata_class()(**page) for page in page_metadata_json
        ]
    else:
        page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

    # Store the metadata on the node and page nodes
    node.metadata = metadata
    for page, meta in zip(node.page_nodes, page_metadata):
        # The metadata class exposes ownership through its `owner` property
        # setter (as used by `from_owner`); it has no `set_owner` method.
        meta.owner = page
        page.metadata = meta

    # Make sure to set the persistance path on the node
    node.persistance_path = path

    return node

`metadata_class()` `classmethod`

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def metadata_class(cls) -> Type[BaseMetadata]:
    """Resolve the class used to instantiate this node's document metadata.

    Returns:
        `BaseMetadata` when the `metadata` field is still annotated with the
        generic `DocumentNodeMetadata` parameter, otherwise the annotation of
        the `metadata` field itself.

    Raises:
        ValueError: If the `metadata` field annotation is a `ForwardRef`.
    """
    annotation = cls.model_fields["metadata"].annotation

    # The generic parameter was never specialised, so use the base
    # (dictionary-like) metadata model.
    if annotation == DocumentNodeMetadata:
        return BaseMetadata

    # An unresolved forward reference cannot be instantiated.
    if isinstance(annotation, ForwardRef):
        raise ValueError(
            "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
        )

    # Otherwise the annotation is the overriding metadata type.
    return annotation

`page_metadata_class()` `classmethod`

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Resolve the class used to instantiate metadata for this node's pages.

    Returns:
        `BaseMetadata` when the page node's `metadata` field is still
        annotated with the generic `PageNodeMetadata` parameter, otherwise
        the annotation of that field.

    Raises:
        ValueError: If the page node's `metadata` annotation is a `ForwardRef`.
    """
    # The first type argument of the `page_nodes` annotation is the
    # page-node class held by the list.
    page_nodes_annotation = cls.model_fields["page_nodes"].annotation
    page_node_cls = page_nodes_annotation.__args__[0]

    annotation = page_node_cls.model_fields["metadata"].annotation

    # The generic parameter was never specialised: use the base model.
    if annotation == PageNodeMetadata:
        return BaseMetadata

    # An unresolved forward reference cannot be instantiated.
    if isinstance(annotation, ForwardRef):
        raise ValueError(
            "You cannot define PageNode with a ForwardRef for Generic metadata model types."
        )

    return annotation

`persist(path=None, **kwargs)`

Persist a document node to storage.

Parameters:

Name	Type	Description	Default
`path`	`Optional[str]`	Overwrites the current `persistance_path` property - If `persistance_path` is not currently set, path must be provided.	`None`
`**kwargs`		Additional keyword arguments for fsspec FileSystem	`{}`

Returns:

Name	Type	Description
`FileSidecarsPathManager`	`FileSidecarsPathManager`	The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/document.py

def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Write the document, its metadata and its per-page metadata to storage.

    Args:
        path (Optional[str]): Storage location to persist to. When omitted,
            the node's current `persistance_path` is used. When given, it
            replaces the node's `persistance_path`.
        **kwargs: Additional keyword arguments for fsspec FileSystem. They are
            forwarded both to the `FileSystemManager` constructor and to its
            `write` call.

    Returns:
        FileSidecarsPathManager: The value returned by `FileSystemManager.write`.

    Raises:
        ValueError: If neither `path` nor `persistance_path` is available.
    """
    target = path or self.persistance_path
    if target is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Remember where this node now lives.
    self.persistance_path = target

    storage = FileSystemManager(target, **kwargs)

    raw_pdf = self.document.get_bytes()
    raw_metadata = self.metadata.model_dump_json().encode("utf-8")

    # Each page's metadata is dumped to its own JSON string; the list of
    # those strings is then JSON-encoded as a whole.
    page_dumps = [page.metadata.model_dump_json() for page in self.page_nodes]
    raw_page_metadata = json.dumps(page_dumps).encode("utf-8")

    return storage.write(raw_pdf, raw_metadata, raw_page_metadata, **kwargs)

`refresh_locator()`

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/document.py

def refresh_locator(self):
    """
    Rebuild the provenance locator for this document node and return it.

    Raises:
        ValueError: If any page node has a falsy `ocr_results.result`.
    """
    # Imported lazily, inside the method, as in the rest of this module.
    from docprompt.provenance.search import DocumentProvenanceLocator

    every_page_has_ocr = all(page.ocr_results.result for page in self.page_nodes)
    if not every_page_has_ocr:
        raise ValueError(
            "Cannot create a locator for a document node with missing OCR results"
        )

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator

`PageNode`

Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/page.py

class PageNode(BaseNode, Generic[PageNodeMetadata]):
    """
    A single page of a document, carrying its metadata and OCR results.

    The page keeps a back-reference to its owning document node and a private
    raster cache that is emptied whenever the node is pickled.
    """

    # Back-reference to the owning document; left out of serialization and repr.
    document: "DocumentNode" = Field(repr=False, exclude=True)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        default_factory=BaseMetadata,
        description="Application-specific metadata for the page",
    )
    extra: Dict[str, Any] = Field(
        default_factory=dict,
        description="Extra data that can be stored on the page node",
    )

    ocr_results: ResultContainer[OcrPageResult] = Field(
        repr=False,
        description="The OCR results for the page",
        default_factory=_result_container_factory,
    )

    # Cache handed to the page rasterizer; never included in pickled state.
    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        """Return the pickle state with the raster cache replaced by an empty dict."""
        pickled = super().__getstate__()

        private_attrs = pickled["__pydantic_private__"]
        private_attrs["_raster_cache"] = {}

        return pickled

    @property
    def rasterizer(self):
        """A `PageRasterizer` bound to this page and its raster cache."""
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True
    ):
        """Run `query` through the owning document's locator, restricted to this page.

        Args:
            query: The text to search for.
            refine_to_words: Forwarded to the locator as `refine_to_word`.
            require_exact_match: Forwarded to the locator unchanged.

        Returns:
            Whatever the document locator's `search` returns.
        """
        locator = self.document.locator

        return locator.search(
            query,
            page_number=self.page_number,
            refine_to_word=refine_to_words,
            require_exact_match=require_exact_match,
        )

`metadata`

The metadata class is utilized for defining a basic, yet flexible interface for metadata attached to various fields.

In essence, this allows developers to choose either to create their metadata in an unstructured manner (i.e. a dictionary), or to subclass the base metadata class in order to create a more strictly typed metadata model for their page and document nodes.

`BaseMetadata`

Bases: BaseModel, MutableMapping, Generic[TMetadataOwner]

The base metadata class is utilized for defining a basic yet flexible interface for metadata attached to various fields.

The metadata class can be used in two ways

As a dictionary-like object, where metadata is stored in the extra field.
As a sub-classed model, where metadata is stored in the fields of the model.

When used out of the box, the metadata class will adopt dictionary-like behavior. You may easily access different fields of the metadata as if it were a dictionary:

# Instantiate it with any kwargs you like
metadata = BaseMetadata(foo='bar', cow='moo')

metadata["foo"]  # "bar"
metadata["cow"]  # "moo"

# Update the value of the key
metadata["foo"] = "fighters"

# Set new key-value pairs
metadata['sheep'] = 'baa'

Otherwise, you may sub-class the metadata class in order to create a more strictly typed metadata model. This is useful when you want to enforce a specific structure for your metadata.

class CustomMetadata(BaseMetadata):
    foo: str
    cow: str

# Instantiate it with the required fields
metadata = CustomMetadata(foo='bar', cow='moo')

metadata.foo  # "bar"
metadata.cow  # "moo"

# Update the value of the key
metadata.foo = "fighters"

# Use the extra field to store dynamic metadata
metadata.extra['sheep'] = 'baa'

Additionally, the task results descriptor allows for controlled and easy access to the task results of various tasks that are run on the parent node.

Source code in docprompt/schema/pipeline/metadata.py

class BaseMetadata(BaseModel, MutableMapping, Generic[TMetadataOwner]):
    """
    The base metadata class is utilized for defining a basic yet flexible interface
    for metadata attached to various fields.

    The metadata class can be used in two ways:
        1. As a dictionary-like object, where metadata is stored in the `extra` field.
        2. As a sub-classed model, where metadata is stored in the fields of the model.

    When used out of the box, the metadata class will adopt dictionary-like behavior. You
    may easily access different fields of the metadata as if it were a dictionary:
    ```python
    # Instantiate it with any kwargs you like
    metadata = BaseMetadata(foo='bar', cow='moo')

    metadata["foo"]  # "bar"
    metadata["cow"]  # "moo"

    # Update the value of the key
    metadata["foo"] = "fighters"

    # Set new key-value pairs
    metadata['sheep'] = 'baa'
    ```

    Otherwise, you may sub-class the metadata class in order to create a more strictly typed
    metadata model. This is useful when you want to enforce a specific structure for your metadata.

    ```python
    class CustomMetadata(BaseMetadata):
        foo: str
        cow: str

    # Instantiate it with the required fields
    metadata = CustomMetadata(foo='bar', cow='moo')

    metadata.foo  # "bar"
    metadata.cow  # "moo"

    # Update the value of the key
    metadata.foo = "fighters"

    # Use the extra field to store dynamic metadata
    metadata.extra['sheep'] = 'baa'
    ```

    Additionally, the task results descriptor allows for controlled and easy access to the task results
    of various tasks that are run on the parent node.
    """

    # Storage for free-form metadata; the only field on the base model.
    extra: Dict[str, Any] = Field(..., default_factory=dict, repr=False)

    # Descriptor instance driven manually through the `task_results` property.
    _task_results: TaskResultsDescriptor = PrivateAttr(
        default_factory=TaskResultsDescriptor
    )

    # The node this metadata belongs to; set via `owner` / `from_owner`.
    _owner: TMetadataOwner = PrivateAttr()

    @property
    def task_results(self) -> TaskResultsDescriptor:
        """Return the task results descriptor."""
        return self._task_results.__get__(self)

    @task_results.setter
    def task_results(self, value: Any) -> None:
        """Delegate assignment to the descriptor's `__set__`.

        NOTE: This implementation is here purely to avoid the task_results property from being
        overwritten by accident. The descriptor is expected to reject the
        assignment (per the original author; its implementation is not shown here).
        """
        self._task_results.__set__(self, value)

    @property
    def owner(self) -> TMetadataOwner:
        """Return the owner of the metadata."""
        return self._owner

    @owner.setter
    def owner(self, owner: TMetadataOwner) -> None:
        """Set the owner of the metadata."""
        self._owner = owner

    @classmethod
    def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
        """Create a new instance of the metadata class with the owner set."""
        metadata = cls(**data)
        metadata.owner = owner
        return metadata

    @model_validator(mode="before")
    @classmethod
    def validate_data_fields_from_annotations(cls, data: Any) -> Any:
        """Merge the `extra` payload with the top-level keys before validation.

        For the dictionary-like base model every key ends up inside `extra`.
        For a sub-classed model the merged keys are offered to pydantic as
        candidate field values while `extra` is kept as provided.

        The input mapping is never mutated. Non-mapping input is returned
        unchanged so that pydantic can validate (or reject) it itself.

        Raises:
            ValueError: If `extra` is present but is not a dictionary.
        """
        if not isinstance(data, MutableMapping):
            return data

        # Work on a shallow copy: the caller's mapping must stay untouched.
        fields = dict(data)

        # We want to make sure that we combine the `extra` metadata along with any
        # other specific fields that are defined in the metadata.
        extra = fields.pop("extra", {})
        if not isinstance(extra, dict):
            # A real exception rather than `assert`, which is stripped under -O.
            raise ValueError("The `extra` field must be a dictionary.")
        merged = {**fields, **extra}

        # If the model has been sub-classed, then all of our fields must be
        # validated by the pydantic model.
        if cls._is_field_typed():
            # Keys coming from `extra` are offered as potential field values,
            # while whatever was in `extra` stays stored in `extra`.
            return {**merged, "extra": extra}

        # Otherwise, we are using our mock-dict implementation, so we store our
        # metadata in the `extra` field.
        return {"extra": merged}

    @classmethod
    def _is_field_typed(cls):
        """
        Check if the metadata model is field typed.

        The model counts as field typed as soon as it declares any field
        other than `extra`; otherwise it is the dictionary-like model.
        """
        return set(cls.model_fields.keys()) != {"extra"}

    def __repr__(self):
        """
        Provide a string representation of the metadata.

        The dictionary-like model is rendered as the JSON dump of `extra`;
        a sub-classed model uses the parent class `__repr__`.
        """
        if self._is_field_typed():
            return super().__repr__()

        # Otherwise, we are dealing with dictionary-like metadata
        return json.dumps(self.extra)

    def __getitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. On a sub-classed model
        an `AttributeError` is raised.
        """
        if not self._is_field_typed():
            return self.extra[name]

        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __setitem__(self, name, value):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. On a sub-classed model
        an `AttributeError` is raised.
        """
        if not self._is_field_typed():
            self.extra[name] = value
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __delitem__(self, name):
        """
        Provide dictionary functionality to the metadata class.

        This only works for the base metadata model. On a sub-classed model
        an `AttributeError` is raised.
        """
        if not self._is_field_typed():
            del self.extra[name]
        else:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )

    def __iter__(self):
        """
        Iterate over the keys in the metadata.

        This only works for the base metadata model. On a sub-classed model
        an `AttributeError` is raised.
        """
        if self._is_field_typed():
            raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

        return iter(self.extra)

    def __len__(self):
        """
        Get the number of keys in the metadata.

        This only works for the base metadata model. On a sub-classed model
        an `AttributeError` is raised.
        """
        if self._is_field_typed():
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '__len__'"
            )

        return len(self.extra)

    def __getattr__(self, name):
        """
        Allow for getting of attributes on the metadata class.

        Python only calls this hook after normal attribute lookup has failed.
        The attributes are then retrieved through the following hierarchy:
            - If the model is sub-classed, it will be retrieved as normal.
            - Otherwise, if the attribute is private, it will be retrieved as normal.
            - Finally, a public attribute on the base metadata class is read
                from the `extra` dictionary; a missing key yields `None`.
        """
        if self._is_field_typed():
            return super().__getattr__(name)

        if name.startswith("_"):
            return super().__getattr__(name)

        # NOTE(review): the previous `try/except KeyError` around this lookup
        # could never trigger because `dict.get` does not raise. Unknown public
        # names therefore resolve to `None` instead of raising AttributeError;
        # that behavior is kept here so existing callers are unaffected.
        return self.extra.get(name)

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Allow for setting of attributes on the metadata class.

        The attributes are set through the following hierarchy:
            - If the model is sub-classed, it will be set as normal.
            - Otherwise, if the attribute is private, it will be set as normal.
            - `owner` and `task_results` are set as normal as well.
            - Finally, if we are setting a public attribute on the base metadata class,
                we use the extra field.
        """
        if self._is_field_typed():
            return super().__setattr__(name, value)

        # We want to avoid setting any private attributes in the extra
        # dictionary
        if name.startswith("_"):
            return super().__setattr__(name, value)

        # If it is `owner` or `task_results`, we want
        # to avoid setting the attribute in the `extra` dictionary
        if name in ["owner", "task_results"]:
            return super().__setattr__(name, value)

        self.extra[name] = value

    def __delattr__(self, name: str) -> None:
        """
        Ensure that we can delete attributes from the metadata class.

        The attributes are deleted through the following hierarchy:
            - If the attribute is `task_results`, we use the descriptor to delete the task results.
            - Otherwise, if it is a sub-classed model, it will be deleted as normal.
            - Finally, if we are deleting a public attribute on the base metadata class,
                we use the extra field.
        """

        # We want to use the descriptor to delete the task results
        if name == "task_results":
            self._task_results.__delete__(self)
            return

        # Otherwise, we use our standard fallback tiers
        if self._is_field_typed():
            return super().__delattr__(name)

        del self.extra[name]

`owner: TMetadataOwner` `property` `writable`

Return the owner of the metadata.

`task_results: TaskResultsDescriptor` `property` `writable`

Return the task results descriptor.

`delattr(name)`

Ensure that we can delete attributes from the metadata class.

The attributes are deleted through the following hierarchy:

If the attribute is task_results, we use the descriptor to delete the task results.
Otherwise, if it is a sub-classed model, it will be deleted as normal.
Finally, if we are deleting a public attribute on the base metadata class, we use the extra field.

Source code in docprompt/schema/pipeline/metadata.py

def __delattr__(self, name: str) -> None:
    """
    Delete an attribute from the metadata.

    Resolution order:
        - `task_results` is cleared through the task results descriptor.
        - On a sub-classed model, deletion is handled by the parent class.
        - On the base metadata model, the key is removed from `extra`.
    """
    if name == "task_results":
        # The descriptor owns the task results, so it performs the deletion.
        self._task_results.__delete__(self)
    elif self._is_field_typed():
        return super().__delattr__(name)
    else:
        del self.extra[name]

`delitem(name)`

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__delitem__` method.

Source code in docprompt/schema/pipeline/metadata.py

def __delitem__(self, name):
    """
    Remove a key from the metadata, dictionary style.

    Only the base (dictionary-like) metadata model supports this; a
    sub-classed model raises `AttributeError`.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    del self.extra[name]

`getattr(name)`

Allow for getting of attributes on the metadata class.

The attributes are retrieved through the following hierarchy:

If the model is sub-classed, it will be retrieved as normal.
Otherwise, if the attribute is private, it will be retrieved as normal.
Finally, if we are getting a public attribute on the base metadata class, we use the extra field.
If the key is not set in the extra dictionary, we resort back to just trying to get the field.
- This is when we grab the `owner` or `task_results` attribute.

Source code in docprompt/schema/pipeline/metadata.py

def __getattr__(self, name):
    """
    Resolve attributes that normal lookup could not find.

    Resolution order:
        - On a sub-classed model, or for a private (underscore-prefixed)
          name, the parent class lookup is used.
        - Otherwise the name is read from the `extra` dictionary; a key
          that is not present yields `None`.
    """
    if self._is_field_typed() or name.startswith("_"):
        return super().__getattr__(name)

    # NOTE(review): `dict.get` never raises KeyError, so there is no fallback
    # to the parent lookup for unknown public names — they resolve to None.
    return self.extra.get(name)

`getitem(name)`

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__getitem__` method.

Source code in docprompt/schema/pipeline/metadata.py

def __getitem__(self, name):
    """
    Look up a key in the metadata, dictionary style.

    Only the base (dictionary-like) metadata model supports this; a
    sub-classed model raises `AttributeError`.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    return self.extra[name]

`iter()`

Iterate over the keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a dictionary-style `__iter__` method.

Source code in docprompt/schema/pipeline/metadata.py

def __iter__(self):
    """
    Iterate over the metadata keys.

    Only the base (dictionary-like) metadata model supports this; a
    sub-classed model raises `AttributeError`.
    """
    if not self._is_field_typed():
        return iter(self.extra)

    raise AttributeError(f"'{self.__class__.__name__}' object is not iterable")

`len()`

Get the number of keys in the metadata.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__len__` method.

Source code in docprompt/schema/pipeline/metadata.py

def __len__(self):
    """
    Return how many keys the metadata holds.

    Only the base (dictionary-like) metadata model supports this; a
    sub-classed model raises `AttributeError`.
    """
    if not self._is_field_typed():
        return len(self.extra)

    raise AttributeError(
        f"'{self.__class__.__name__}' object has no attribute '__len__'"
    )

`repr()`

Provide a string representation of the metadata.

For the base metadata model this is the JSON dump of the `extra` dictionary. A sub-classed model falls back to the parent class `__repr__`.

Source code in docprompt/schema/pipeline/metadata.py

def __repr__(self):
    """
    Build the string representation of the metadata.

    The dictionary-like base model is rendered as the JSON dump of its
    `extra` dictionary; a sub-classed model uses the parent class `__repr__`.
    """
    if not self._is_field_typed():
        # Dictionary-like metadata: show the stored keys and values.
        return json.dumps(self.extra)

    return super().__repr__()

`setattr(name, value)`

Allow for setting of attributes on the metadata class.

The attributes are set through the following hierarchy:

If the model is sub-classed, it will be set as normal.
Otherwise, if the attribute is private, it will be set as normal.
Finally, if we are setting a public attribute on the base metadata class, we use the extra field.

Source code in docprompt/schema/pipeline/metadata.py

def __setattr__(self, name: str, value: Any) -> None:
    """
    Set an attribute on the metadata.

    Resolution order:
        - On a sub-classed model the attribute is set by the parent class.
        - Private (underscore-prefixed) names are set by the parent class.
        - `owner` and `task_results` are set by the parent class.
        - Any other public name on the base metadata model is stored in
          the `extra` dictionary.
    """
    handled_by_parent = (
        self._is_field_typed()
        or name.startswith("_")
        or name in ("owner", "task_results")
    )
    if handled_by_parent:
        return super().__setattr__(name, value)

    self.extra[name] = value

`setitem(name, value)`

Provide dictionary functionality to the metadata class.

This only works for the base metadata model. If sub-classed, this will raise an error, unless overridden, as BaseModels do not have a `__setitem__` method.

Source code in docprompt/schema/pipeline/metadata.py

def __setitem__(self, name, value):
    """
    Store a key/value pair in the metadata, dictionary style.

    Only the base (dictionary-like) metadata model supports this; a
    sub-classed model raises `AttributeError`.
    """
    if self._is_field_typed():
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    self.extra[name] = value

`from_owner(owner, **data)` `classmethod`

Create a new instance of the metadata class with the owner set.

Source code in docprompt/schema/pipeline/metadata.py

@classmethod
def from_owner(cls, owner: TMetadataOwner, **data) -> BaseMetadata:
    """Build a metadata instance from `data` and attach `owner` to it."""
    instance = cls(**data)
    # Ownership is assigned after construction, through the `owner` attribute.
    instance.owner = owner
    return instance

`validate_data_fields_from_annotations(data)` `classmethod`

Validate the data fields from the annotations.

Source code in docprompt/schema/pipeline/metadata.py

@model_validator(mode="before")
@classmethod
def validate_data_fields_from_annotations(cls, data: Any) -> Any:
    """Merge the `extra` payload with the top-level keys before validation.

    For the dictionary-like base model every key ends up inside `extra`.
    For a sub-classed model the merged keys are offered to pydantic as
    candidate field values while `extra` is kept as provided.

    The input mapping is never mutated. Non-mapping input is returned
    unchanged so that pydantic can validate (or reject) it itself.

    Raises:
        ValueError: If `extra` is present but is not a dictionary.
    """
    if not isinstance(data, MutableMapping):
        return data

    # Work on a shallow copy: the caller's mapping must stay untouched.
    fields = dict(data)

    # We want to make sure that we combine the `extra` metadata along with any
    # other specific fields that are defined in the metadata.
    extra = fields.pop("extra", {})
    if not isinstance(extra, dict):
        # A real exception rather than `assert`, which is stripped under -O.
        raise ValueError("The `extra` field must be a dictionary.")
    merged = {**fields, **extra}

    # If the model has been sub-classed, then all of our fields must be
    # validated by the pydantic model.
    if cls._is_field_typed():
        # Keys coming from `extra` are offered as potential field values,
        # while whatever was in `extra` stays stored in `extra`.
        return {**merged, "extra": extra}

    # Otherwise, we are using our mock-dict implementation, so we store our
    # metadata in the `extra` field.
    return {"extra": merged}

`node`

`DocumentCollection`

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py

class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata

    The model is generic over three metadata types: the collection's own
    metadata, and the document-node and page-node metadata types that are
    passed through to the contained `DocumentNode` entries.
    """

    # The documents in the collection, parameterized with the same
    # document/page metadata types as the collection itself.
    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    # Collection-level metadata; built with `dict()` when not supplied.
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)

`DocumentNode`

Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/document.py

# Generic over the document-level and page-level metadata types. Provides
# construction from a `PdfDocument`, a lazily built provenance locator, and
# persistence to / loading from a storage path.
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    """
    Represents a single document, with some metadata
    """

    # The underlying PDF document
    document: PdfDocument
    # The page nodes of the document; `from_document` appends one per page
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    )
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",
        default_factory=BaseMetadata,
    )

    # Built on first access of `locator`; reset to None in the pickled state
    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    # Base storage path; written by `persist`, `from_storage` and the
    # `persistance_path` setter
    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        """Return the pickling state with the locator entry set to None."""
        state = super().__getstate__()

        # The locator is not carried in the pickled state; `locator` rebuilds
        # it on the next access when it is None.
        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        """Return the number of page nodes."""
        return len(self.page_nodes)

    def __getitem__(self, index):
        """Index directly into `page_nodes`."""
        return self.page_nodes[index]

    def __iter__(self):
        """Iterate over the page nodes."""
        return iter(self.page_nodes)

    @property
    def rasterizer(self):
        """A new `DocumentRasterizer` bound to this node (created per access)."""
        return DocumentRasterizer(self)

    @property
    def locator(self):
        """The provenance locator, built through `refresh_locator` when unset."""
        if self._locator is None:
            self.refresh_locator()

        return self._locator

    def refresh_locator(self):
        """
        Refreshes the locator for this document node

        Raises:
            ValueError: If any page node has a falsy `ocr_results.result`.
        """
        # Imported locally rather than at module level
        # NOTE(review): presumably to avoid a circular import - confirm
        from docprompt.provenance.search import DocumentProvenanceLocator

        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    @classmethod
    def from_document(
        cls,
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
    ):
        """Build a document node, with one page node per page, from a document.

        Args:
            document: The PDF document to wrap.
            document_metadata: Metadata for the node. When falsy, metadata is
                created through `metadata_class().from_owner`.
            page_metadata: Optional per-page metadata, one item per page in
                page order.

        Returns:
            The new document node with `page_nodes` populated.

        Raises:
            ValueError: If `page_metadata` is given and its length differs
                from the number of pages in the document.
        """
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
            document=document,
        )
        # Note that the fallback is chosen by truthiness, not by `is None`
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}
        )

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."
            )

        # Page numbers are 1-based; `page_metadata` is indexed 0-based
        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    document=document_node,
                    page_number=page_number,
                    metadata=page_metadata[page_number - 1],
                )
            else:
                page_node = PageNode(document=document_node, page_number=page_number)

            document_node.page_nodes.append(page_node)

        return document_node

    @property
    def file_hash(self):
        """The `document_hash` of the underlying document."""
        return self.document.document_hash

    @property
    def document_name(self):
        """The name of the underlying document."""
        return self.document.name

    @classmethod
    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # The annotation of the `metadata` field tells us whether the Generic
        # has been given a concrete document metadata type.
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, fall back to
        # the default `BaseMetadata` class
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
            )

        # Get the overridden Generic type of the DocumentNodeMetadata
        return metadata_field_annotation

    @classmethod
    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # Read the annotation of the `metadata` field declared on the page
        # node class itself
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
            "metadata"
        ].annotation

        # No override provided for the page metadata: use `BaseMetadata`
        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."
            )

        return page_node_metadata_field_annotation

    @property
    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    @persistance_path.setter
    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    @classmethod
    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

        Args:
            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            DocumentNode: The loaded document node.
        """

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
            file_hash, **kwargs
        )

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        # Document metadata: decoded from the stored JSON when present,
        # otherwise created without any fields
        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
        else:
            metadata = cls.metadata_class().from_owner(node, **{})

        # Page metadata is stored as a JSON list whose items are themselves
        # JSON strings (see `persist`), hence the two decoding passes
        if page_metadata_bytes:
            page_metadata_json = [
                json.loads(page_str)
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            ]
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            ]
        else:
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        # (`zip` stops at the shorter of the two sequences)
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            meta.set_owner(page)
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

        Args:
            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            FileSidecarsPathManager: The file path manager for the persisted document node.

        Raises:
            ValueError: If neither `path` nor `persistance_path` provides a path.
        """

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        # Each page's metadata is dumped to a JSON string first, and the list
        # of those strings is JSON-encoded again; `from_storage` reverses this
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
            encoding="utf-8",
        )

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
        )

`persistance_path` `property` `writable`

The base path to storage location.

`from_storage(path, file_hash, **kwargs)` `classmethod`

Load the document node from storage.

Parameters:

Name	Type	Description	Default
`path`	`str`	The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"	required
`file_hash`	`str`	The hash of the document.	required
`**kwargs`		Additional keyword arguments for fsspec FileSystem	`{}`

Returns:

Name	Type	Description
`DocumentNode`	`Self`	The loaded document node.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Rebuild a document node from a storage location.

    Args:
        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        DocumentNode: The loaded document node, with its `persistance_path`
        set to `path`.
    """
    manager = FileSystemManager(path, **kwargs)
    pdf_bytes, metadata_bytes, page_metadata_bytes = manager.read(
        file_hash, **kwargs
    )

    pdf_name = manager.get_pdf_name(file_hash)
    doc = PdfDocument.from_bytes(pdf_bytes, name=pdf_name)
    node = cls.from_document(doc)

    # Document-level metadata: decode the stored JSON when there is any,
    # otherwise build the metadata without fields.
    document_fields = (
        json.loads(metadata_bytes.decode("utf-8")) if metadata_bytes else {}
    )
    metadata = cls.metadata_class().from_owner(node, **document_fields)

    # Page-level metadata is a JSON list of JSON strings, so it is decoded
    # in two passes before the metadata objects are created.
    if page_metadata_bytes:
        encoded_pages = json.loads(page_metadata_bytes.decode("utf-8"))
        decoded_pages = [json.loads(encoded) for encoded in encoded_pages]
        page_metadata = [
            cls.page_metadata_class()(**page_fields) for page_fields in decoded_pages
        ]
    else:
        page_metadata = [cls.page_metadata_class()() for _ in range(len(doc))]

    # Attach the metadata to the node and, pairwise, to its page nodes
    node.metadata = metadata
    for page_node, page_meta in zip(node.page_nodes, page_metadata):
        page_meta.set_owner(page_node)
        page_node.metadata = page_meta

    # Remember where the node was loaded from
    node.persistance_path = path
    return node

`metadata_class()` `classmethod`

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def metadata_class(cls) -> Type[BaseMetadata]:
    """Resolve the class used to instantiate this node type's metadata.

    Returns:
        `BaseMetadata` when the `metadata` field is still annotated with
        `DocumentNodeMetadata`, otherwise the annotation of the field.

    Raises:
        ValueError: If the `metadata` annotation is a `ForwardRef`.
    """
    annotation = cls.model_fields["metadata"].annotation

    # No concrete metadata type has been supplied: use the default class
    if annotation == DocumentNodeMetadata:
        return BaseMetadata

    # Anything that is not a forward reference is the overriding type
    if not isinstance(annotation, ForwardRef):
        return annotation

    raise ValueError(
        "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
    )

`page_metadata_class()` `classmethod`

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Resolve the metadata class used by the page nodes of this node type.

    Returns:
        `BaseMetadata` when the page node's `metadata` field is still
        annotated with `PageNodeMetadata`, otherwise that annotation.

    Raises:
        ValueError: If the page node's `metadata` annotation is a `ForwardRef`.
    """
    # `page_nodes` is annotated as a `List[...]`; its first type argument is
    # the page node class.
    page_nodes_annotation = cls.model_fields["page_nodes"].annotation
    page_node_class = page_nodes_annotation.__args__[0]

    annotation = page_node_class.model_fields["metadata"].annotation

    # No concrete page metadata type has been supplied: use the default class
    if annotation == PageNodeMetadata:
        return BaseMetadata

    if not isinstance(annotation, ForwardRef):
        return annotation

    raise ValueError(
        "You cannot define PageNode with a ForwardRef for Generic metadata model types."
    )

`persist(path=None, **kwargs)`

Persist a document node to storage.

Parameters:

Name	Type	Description	Default
`path`	`Optional[str]`	Overwrites the current `persistance_path` property - If `persistance_path` is not currently set, path must be provided.	`None`
`**kwargs`		Additional keyword arguments for fsspec FileSystem	`{}`

Returns:

Name	Type	Description
`FileSidecarsPathManager`	`FileSidecarsPathManager`	The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/document.py

def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Write the document, its metadata and its page metadata to storage.

    Args:
        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        FileSidecarsPathManager: The file path manager for the persisted document node.

    Raises:
        ValueError: If neither `path` nor `persistance_path` provides a path.
    """
    target = path or self.persistance_path
    if target is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Keep the node pointing at the location it was last persisted to
    self.persistance_path = target

    manager = FileSystemManager(target, **kwargs)

    pdf_bytes = self.document.get_bytes()
    metadata_bytes = self.metadata.model_dump_json().encode("utf-8")

    # Every page's metadata is dumped to its own JSON string; the list of
    # those strings is then JSON-encoded as a whole.
    serialized_pages = [page.metadata.model_dump_json() for page in self.page_nodes]
    page_metadata_bytes = json.dumps(serialized_pages).encode("utf-8")

    return manager.write(pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs)

`refresh_locator()`

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/document.py

def refresh_locator(self):
    """
    Rebuilds the provenance locator for this document node

    Raises:
        ValueError: If any page node has a falsy `ocr_results.result`.
    """
    from docprompt.provenance.search import DocumentProvenanceLocator

    # Every page must carry OCR results before a locator can be built
    for page in self.page_nodes:
        if not page.ocr_results.result:
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

    self._locator = DocumentProvenanceLocator.from_document_node(self)

    return self.locator

`PageNode`

Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/page.py

class PageNode(BaseNode, Generic[PageNodeMetadata]):
    """
    Represents a single page in a document, with some metadata
    """

    # Back-reference to the owning document node (excluded from dumps and repr)
    document: "DocumentNode" = Field(repr=False, exclude=True)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        default_factory=BaseMetadata,
        description="Application-specific metadata for the page",
    )
    extra: Dict[str, Any] = Field(
        default_factory=dict,
        description="Extra data that can be stored on the page node",
    )

    ocr_results: ResultContainer[OcrPageResult] = Field(
        description="The OCR results for the page",
        default_factory=_result_container_factory,
        repr=False,
    )

    # Handed to every `PageRasterizer` created by `rasterizer`
    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        """Return the pickling state with an empty raster cache."""
        pickle_state = super().__getstate__()
        private_attrs = pickle_state["__pydantic_private__"]
        private_attrs["_raster_cache"] = {}
        return pickle_state

    @property
    def rasterizer(self):
        """A new `PageRasterizer` for this page, backed by the raster cache."""
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True
    ):
        """Search this page through the owning document's locator.

        The call is restricted to this page by passing `page_number`;
        `refine_to_words` is forwarded as the locator's `refine_to_word`.
        """
        locator = self.document.locator
        return locator.search(
            query,
            page_number=self.page_number,
            refine_to_word=refine_to_words,
            require_exact_match=require_exact_match,
        )

`base`

`BaseNode`

Bases: BaseModel

The base node class is utilized for defining a basic yet flexible interface

Source code in docprompt/schema/pipeline/node/base.py

class BaseNode(BaseModel):
    """The base node class is utilized for defining a basic yet flexible interface"""

    # No fields or methods are declared here; `DocumentNode` and `PageNode`
    # both list this class as their base.

`collection`

`DocumentCollection`

Bases: BaseModel, Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata]

Represents a collection of documents with some common metadata

Source code in docprompt/schema/pipeline/node/collection.py

# Generic over the collection-level, document-level and page-level metadata
# types, in that order.
class DocumentCollection(
    BaseModel,
    Generic[DocumentCollectionMetadata, DocumentNodeMetadata, PageNodeMetadata],
):
    """
    Represents a collection of documents with some common metadata
    """

    # The document nodes that belong to this collection
    document_nodes: List["DocumentNode[DocumentNodeMetadata, PageNodeMetadata]"]
    # Metadata shared by the whole collection; `dict` is declared as the factory.
    # NOTE(review): `...` is passed together with `default_factory` - confirm
    # whether this field is meant to be required or optional.
    metadata: DocumentCollectionMetadata = Field(..., default_factory=dict)

`document`

`DocumentNode`

Bases: BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]

Represents a single document, with some metadata

Source code in docprompt/schema/pipeline/node/document.py

# Generic over the document-level and page-level metadata types. Provides
# construction from a `PdfDocument`, a lazily built provenance locator, and
# persistence to / loading from a storage path.
class DocumentNode(BaseNode, Generic[DocumentNodeMetadata, PageNodeMetadata]):
    """
    Represents a single document, with some metadata
    """

    # The underlying PDF document
    document: PdfDocument
    # The page nodes of the document; `from_document` appends one per page
    page_nodes: List[PageNode[PageNodeMetadata]] = Field(
        description="The pages in the document", default_factory=list, repr=False
    )
    metadata: DocumentNodeMetadata = Field(
        description="Application-specific metadata for the document",
        default_factory=BaseMetadata,
    )

    # Built on first access of `locator`; reset to None in the pickled state
    _locator: Optional["DocumentProvenanceLocator"] = PrivateAttr(default=None)

    # Base storage path; written by `persist`, `from_storage` and the
    # `persistance_path` setter
    _persistance_path: Optional[str] = PrivateAttr(default=None)

    def __getstate__(self):
        """Return the pickling state with the locator entry set to None."""
        state = super().__getstate__()

        # The locator is not carried in the pickled state; `locator` rebuilds
        # it on the next access when it is None.
        state["__pydantic_private__"]["_locator"] = None

        return state

    def __len__(self):
        """Return the number of page nodes."""
        return len(self.page_nodes)

    def __getitem__(self, index):
        """Index directly into `page_nodes`."""
        return self.page_nodes[index]

    def __iter__(self):
        """Iterate over the page nodes."""
        return iter(self.page_nodes)

    @property
    def rasterizer(self):
        """A new `DocumentRasterizer` bound to this node (created per access)."""
        return DocumentRasterizer(self)

    @property
    def locator(self):
        """The provenance locator, built through `refresh_locator` when unset."""
        if self._locator is None:
            self.refresh_locator()

        return self._locator

    def refresh_locator(self):
        """
        Refreshes the locator for this document node

        Raises:
            ValueError: If any page node has a falsy `ocr_results.result`.
        """
        # Imported locally rather than at module level
        # NOTE(review): presumably to avoid a circular import - confirm
        from docprompt.provenance.search import DocumentProvenanceLocator

        if any(not page.ocr_results.result for page in self.page_nodes):
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

        self._locator = DocumentProvenanceLocator.from_document_node(self)

        return self.locator

    @classmethod
    def from_document(
        cls,
        document: PdfDocument,
        document_metadata: Optional[DocumentNodeMetadata] = None,
        page_metadata: Optional[List[PageNodeMetadata]] = None,
    ):
        """Build a document node, with one page node per page, from a document.

        Args:
            document: The PDF document to wrap.
            document_metadata: Metadata for the node. When falsy, metadata is
                created through `metadata_class().from_owner`.
            page_metadata: Optional per-page metadata, one item per page in
                page order.

        Returns:
            The new document node with `page_nodes` populated.

        Raises:
            ValueError: If `page_metadata` is given and its length differs
                from the number of pages in the document.
        """
        document_node: "DocumentNode[DocumentNodeMetadata, PageNodeMetadata]" = cls(
            document=document,
        )
        # Note that the fallback is chosen by truthiness, not by `is None`
        document_node.metadata = document_metadata or cls.metadata_class().from_owner(
            document_node, **{}
        )

        if page_metadata is not None and len(page_metadata) != len(document):
            raise ValueError(
                "The number of page metadata items must match the number of pages in the document."
            )

        # Page numbers are 1-based; `page_metadata` is indexed 0-based
        for page_number in range(1, len(document) + 1):
            if page_metadata is not None:
                page_node = PageNode(
                    document=document_node,
                    page_number=page_number,
                    metadata=page_metadata[page_number - 1],
                )
            else:
                page_node = PageNode(document=document_node, page_number=page_number)

            document_node.page_nodes.append(page_node)

        return document_node

    @property
    def file_hash(self):
        """The `document_hash` of the underlying document."""
        return self.document.document_hash

    @property
    def document_name(self):
        """The name of the underlying document."""
        return self.document.name

    @classmethod
    def metadata_class(cls) -> Type[BaseMetadata]:
        """Get the metadata class for instantiating metadata from the model."""

        fields = cls.model_fields

        # The annotation of the `metadata` field tells us whether the Generic
        # has been given a concrete document metadata type.
        metadata_field_annotation = fields["metadata"].annotation

        # If no override has been provided to the metadata model, fall back to
        # the default `BaseMetadata` class
        if metadata_field_annotation == DocumentNodeMetadata:
            return BaseMetadata

        if isinstance(metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
            )

        # Get the overridden Generic type of the DocumentNodeMetadata
        return metadata_field_annotation

    @classmethod
    def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
        """Get the metadata class for the page nodes in the document."""
        fields = cls.model_fields

        # NOTE: The indexing is important here, and it allows us to get the type of each
        # page node in the `List` annotation
        page_nodes_field_class = fields["page_nodes"].annotation.__args__[0]

        # Read the annotation of the `metadata` field declared on the page
        # node class itself
        page_node_metadata_field_annotation = page_nodes_field_class.model_fields[
            "metadata"
        ].annotation

        # No override provided for the page metadata: use `BaseMetadata`
        if page_node_metadata_field_annotation == PageNodeMetadata:
            return BaseMetadata

        if isinstance(page_node_metadata_field_annotation, ForwardRef):
            raise ValueError(
                "You cannot define PageNode with a ForwardRef for Generic metadata model types."
            )

        return page_node_metadata_field_annotation

    @property
    def persistance_path(self):
        """The base path to storage location."""
        return self._persistance_path

    @persistance_path.setter
    def persistance_path(self, path: str):
        """Set the base path to storage location."""
        self._persistance_path = path

    @classmethod
    def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
        """Load the document node from storage.

        Args:
            path (str): The base path to storage location.
                - Example (S3): "s3://bucket-name/key/to/folder"
                - Example (Local FS): "/tmp/docprompt/storage"
            file_hash (str): The hash of the document.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            DocumentNode: The loaded document node.
        """

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes, metadata_bytes, page_metadata_bytes = fs_manager.read(
            file_hash, **kwargs
        )

        doc = PdfDocument.from_bytes(pdf_bytes, name=fs_manager.get_pdf_name(file_hash))
        node = cls.from_document(doc)

        # Document metadata: decoded from the stored JSON when present,
        # otherwise created without any fields
        if metadata_bytes:
            metadata_json = json.loads(metadata_bytes.decode("utf-8"))
            metadata = cls.metadata_class().from_owner(node, **metadata_json)
        else:
            metadata = cls.metadata_class().from_owner(node, **{})

        # Page metadata is stored as a JSON list whose items are themselves
        # JSON strings (see `persist`), hence the two decoding passes
        if page_metadata_bytes:
            page_metadata_json = [
                json.loads(page_str)
                for page_str in json.loads(page_metadata_bytes.decode("utf-8"))
            ]
            page_metadata = [
                cls.page_metadata_class()(**page) for page in page_metadata_json
            ]
        else:
            page_metadata = [cls.page_metadata_class()(**{}) for _ in range(len(doc))]

        # Store the metadata on the node and page nodes
        # (`zip` stops at the shorter of the two sequences)
        node.metadata = metadata
        for page, meta in zip(node.page_nodes, page_metadata):
            meta.set_owner(page)
            page.metadata = meta

        # Make sure to set the persistance path on the node
        node.persistance_path = path

        return node

    def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
        """Persist a document node to storage.

        Args:
            path (Optional[str]): Overwrites the current `persistance_path` property
                - If `persistance_path` is not currently set, path must be provided.
            **kwargs: Additional keyword arguments for fsspec FileSystem

        Returns:
            FileSidecarsPathManager: The file path manager for the persisted document node.

        Raises:
            ValueError: If neither `path` nor `persistance_path` provides a path.
        """

        path = path or self.persistance_path

        if path is None:
            raise ValueError("The path must be provided to persist the document node.")

        # Make sure to update the persistance path
        self.persistance_path = path

        fs_manager = FileSystemManager(path, **kwargs)

        pdf_bytes = self.document.get_bytes()
        metadata_bytes = bytes(self.metadata.model_dump_json(), encoding="utf-8")
        # Each page's metadata is dumped to a JSON string first, and the list
        # of those strings is JSON-encoded again; `from_storage` reverses this
        page_metadata_bytes = bytes(
            json.dumps([page.metadata.model_dump_json() for page in self.page_nodes]),
            encoding="utf-8",
        )

        return fs_manager.write(
            pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs
        )

`persistance_path` `property` `writable`

The base path to storage location.

`from_storage(path, file_hash, **kwargs)` `classmethod`

Load the document node from storage.

Parameters:

Name	Type	Description	Default
`path`	`str`	The base path to storage location. - Example (S3): "s3://bucket-name/key/to/folder" - Example (Local FS): "/tmp/docprompt/storage"	required
`file_hash`	`str`	The hash of the document.	required
`**kwargs`		Additional keyword arguments for fsspec FileSystem	`{}`

Returns:

Name	Type	Description
`DocumentNode`	`Self`	The loaded document node.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def from_storage(cls, path: str, file_hash: str, **kwargs) -> Self:
    """Rebuild a document node from a storage location.

    Args:
        path (str): The base path to storage location.
            - Example (S3): "s3://bucket-name/key/to/folder"
            - Example (Local FS): "/tmp/docprompt/storage"
        file_hash (str): The hash of the document.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        DocumentNode: The loaded document node, with its `persistance_path`
        set to `path`.
    """
    manager = FileSystemManager(path, **kwargs)
    pdf_bytes, metadata_bytes, page_metadata_bytes = manager.read(
        file_hash, **kwargs
    )

    pdf_name = manager.get_pdf_name(file_hash)
    doc = PdfDocument.from_bytes(pdf_bytes, name=pdf_name)
    node = cls.from_document(doc)

    # Document-level metadata: decode the stored JSON when there is any,
    # otherwise build the metadata without fields.
    document_fields = (
        json.loads(metadata_bytes.decode("utf-8")) if metadata_bytes else {}
    )
    metadata = cls.metadata_class().from_owner(node, **document_fields)

    # Page-level metadata is a JSON list of JSON strings, so it is decoded
    # in two passes before the metadata objects are created.
    if page_metadata_bytes:
        encoded_pages = json.loads(page_metadata_bytes.decode("utf-8"))
        decoded_pages = [json.loads(encoded) for encoded in encoded_pages]
        page_metadata = [
            cls.page_metadata_class()(**page_fields) for page_fields in decoded_pages
        ]
    else:
        page_metadata = [cls.page_metadata_class()() for _ in range(len(doc))]

    # Attach the metadata to the node and, pairwise, to its page nodes
    node.metadata = metadata
    for page_node, page_meta in zip(node.page_nodes, page_metadata):
        page_meta.set_owner(page_node)
        page_node.metadata = page_meta

    # Remember where the node was loaded from
    node.persistance_path = path
    return node

`metadata_class()` `classmethod`

Get the metadata class for instantiating metadata from the model.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def metadata_class(cls) -> Type[BaseMetadata]:
    """Resolve the class used to instantiate this node type's metadata.

    Returns:
        `BaseMetadata` when the `metadata` field is still annotated with
        `DocumentNodeMetadata`, otherwise the annotation of the field.

    Raises:
        ValueError: If the `metadata` annotation is a `ForwardRef`.
    """
    annotation = cls.model_fields["metadata"].annotation

    # No concrete metadata type has been supplied: use the default class
    if annotation == DocumentNodeMetadata:
        return BaseMetadata

    # Anything that is not a forward reference is the overriding type
    if not isinstance(annotation, ForwardRef):
        return annotation

    raise ValueError(
        "You cannot define DocumentNode with a ForwardRef for Generic metadata model types."
    )

`page_metadata_class()` `classmethod`

Get the metadata class for the page nodes in the document.

Source code in docprompt/schema/pipeline/node/document.py

@classmethod
def page_metadata_class(cls) -> Type[Union[dict, BaseModel]]:
    """Resolve the metadata class used by the page nodes of this node type.

    Returns:
        `BaseMetadata` when the page node's `metadata` field is still
        annotated with `PageNodeMetadata`, otherwise that annotation.

    Raises:
        ValueError: If the page node's `metadata` annotation is a `ForwardRef`.
    """
    # `page_nodes` is annotated as a `List[...]`; its first type argument is
    # the page node class.
    page_nodes_annotation = cls.model_fields["page_nodes"].annotation
    page_node_class = page_nodes_annotation.__args__[0]

    annotation = page_node_class.model_fields["metadata"].annotation

    # No concrete page metadata type has been supplied: use the default class
    if annotation == PageNodeMetadata:
        return BaseMetadata

    if not isinstance(annotation, ForwardRef):
        return annotation

    raise ValueError(
        "You cannot define PageNode with a ForwardRef for Generic metadata model types."
    )

`persist(path=None, **kwargs)`

Persist a document node to storage.

Parameters:

Name	Type	Description	Default
`path`	`Optional[str]`	Overwrites the current `persistance_path` property - If `persistance_path` is not currently set, path must be provided.	`None`
`**kwargs`		Additional keyword arguments for fsspec FileSystem	`{}`

Returns:

Name	Type	Description
`FileSidecarsPathManager`	`FileSidecarsPathManager`	The file path manager for the persisted document node.

Source code in docprompt/schema/pipeline/node/document.py

def persist(self, path: Optional[str] = None, **kwargs) -> FileSidecarsPathManager:
    """Write the document, its metadata and its page metadata to storage.

    Args:
        path (Optional[str]): Overwrites the current `persistance_path` property
            - If `persistance_path` is not currently set, path must be provided.
        **kwargs: Additional keyword arguments for fsspec FileSystem

    Returns:
        FileSidecarsPathManager: The file path manager for the persisted document node.

    Raises:
        ValueError: If neither `path` nor `persistance_path` provides a path.
    """
    target = path or self.persistance_path
    if target is None:
        raise ValueError("The path must be provided to persist the document node.")

    # Keep the node pointing at the location it was last persisted to
    self.persistance_path = target

    manager = FileSystemManager(target, **kwargs)

    pdf_bytes = self.document.get_bytes()
    metadata_bytes = self.metadata.model_dump_json().encode("utf-8")

    # Every page's metadata is dumped to its own JSON string; the list of
    # those strings is then JSON-encoded as a whole.
    serialized_pages = [page.metadata.model_dump_json() for page in self.page_nodes]
    page_metadata_bytes = json.dumps(serialized_pages).encode("utf-8")

    return manager.write(pdf_bytes, metadata_bytes, page_metadata_bytes, **kwargs)

`refresh_locator()`

Refreshes the locator for this document node

Source code in docprompt/schema/pipeline/node/document.py

def refresh_locator(self):
    """
    Rebuild the provenance locator for this document node and return it.

    The new locator is stored on `self._locator`; the value returned is
    read back through `self.locator`.

    Raises:
        ValueError: If any page node has a falsy OCR result.
    """
    # NOTE(review): imported at call time rather than at module level --
    # presumably to avoid a circular import; confirm before moving it.
    from docprompt.provenance.search import DocumentProvenanceLocator

    # Every page needs an OCR result before a locator can be built.
    for page in self.page_nodes:
        if not page.ocr_results.result:
            raise ValueError(
                "Cannot create a locator for a document node with missing OCR results"
            )

    self._locator = DocumentProvenanceLocator.from_document_node(self)
    return self.locator

`page`

`PageNode`

Bases: BaseNode, Generic[PageNodeMetadata]

Represents a single page in a document, with some metadata

Source code in docprompt/schema/pipeline/node/page.py

class PageNode(BaseNode, Generic[PageNodeMetadata]):
    """
    Represents a single page in a document, with some metadata
    """

    # Back-reference to the owning document node; left out of dumps and repr.
    document: "DocumentNode" = Field(repr=False, exclude=True)
    page_number: PositiveInt = Field(description="The page number")
    metadata: PageNodeMetadata = Field(
        default_factory=BaseMetadata,
        description="Application-specific metadata for the page",
    )
    extra: Dict[str, Any] = Field(
        default_factory=dict,
        description="Extra data that can be stored on the page node",
    )

    ocr_results: ResultContainer[OcrPageResult] = Field(
        repr=False,
        description="The OCR results for the page",
        default_factory=_result_container_factory,
    )

    # Private per-page store of rasterized images, keyed by a caller-chosen name.
    _raster_cache: Dict[str, bytes] = PrivateAttr(default_factory=dict)

    def __getstate__(self):
        """Return the parent's state with the raster cache replaced by an empty dict."""
        state = super().__getstate__()

        # Cached rasters are deliberately left out of the exported state.
        private_attrs = state["__pydantic_private__"]
        private_attrs["_raster_cache"] = {}

        return state

    @property
    def rasterizer(self):
        """Build a `PageRasterizer` from this page's raster cache and the page itself."""
        return PageRasterizer(self._raster_cache, self)

    def search(
        self, query: str, refine_to_words: bool = True, require_exact_match: bool = True
    ):
        """Search for `query` on this page through the owning document's locator.

        The call is delegated to `self.document.locator.search`, restricted to
        this node's `page_number`. `refine_to_words` is handed over as the
        locator's `refine_to_word` argument and `require_exact_match` is
        passed through unchanged. The locator's result is returned as-is.
        """
        locator = self.document.locator
        return locator.search(
            query,
            page_number=self.page_number,
            refine_to_word=refine_to_words,
            require_exact_match=require_exact_match,
        )

`typing`

`rasterizer`

`DocumentRasterizer`

Source code in docprompt/schema/pipeline/rasterizer.py

class DocumentRasterizer:
    """Rasterizes a document node's PDF and caches the images on its page nodes."""

    def __init__(self, owner: "DocumentNode"):
        # The document node whose `document` is rendered and whose
        # `page_nodes` receive the cached rasters.
        self.owner = owner

    def rasterize(
        self,
        name: str,
        *,
        return_mode: Literal["bytes", "pil"] = "bytes",
        dpi: int = 100,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        max_file_size_bytes: Optional[int] = None,
        render_grayscale: bool = False,
    ) -> List[Union[bytes, Image.Image]]:
        """
        Rasterize the owner's PDF and cache each page image under `name`.

        Every keyword option is forwarded unchanged to
        `self.owner.document.rasterize_pdf`. The mapping it returns is keyed
        by page number; each image is stored in the raster cache of the page
        node at index `page_number - 1`.

        Returns:
            The rendered images as a list, in the order of the returned mapping.
        """
        render_options = {
            "dpi": dpi,
            "downscale_size": downscale_size,
            "resize_mode": resize_mode,
            "resize_aspect_ratios": resize_aspect_ratios,
            "do_convert": do_convert,
            "image_convert_mode": image_convert_mode,
            "do_quantize": do_quantize,
            "quantize_color_count": quantize_color_count,
            "max_file_size_bytes": max_file_size_bytes,
            "render_grayscale": render_grayscale,
            "return_mode": return_mode,
        }
        rendered = self.owner.document.rasterize_pdf(**render_options)

        # Page numbers start at one while `page_nodes` is a zero-based list.
        for number, picture in rendered.items():
            self.owner.page_nodes[number - 1]._raster_cache[name] = picture

        return list(rendered.values())

    def propagate_cache(self, name: str, rasters: Dict[int, Union[bytes, Image.Image]]):
        """
        Store already-rendered rasters under `name` on the matching page nodes.

        The keys of `rasters` should be one-indexed page numbers.
        """
        for number, picture in rasters.items():
            self.owner.page_nodes[number - 1]._raster_cache[name] = picture

`propagate_cache(name, rasters)`

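Stores the given rasters under `name` in the raster cache of the matching page nodes. The page-number keys of `rasters` should be one-indexed.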

Source code in docprompt/schema/pipeline/rasterizer.py

def propagate_cache(self, name: str, rasters: Dict[int, Union[bytes, Image.Image]]):
    """
    Store already-rendered rasters under `name` on the matching page nodes.

    The keys of `rasters` should be one-indexed page numbers.
    """
    for number, picture in rasters.items():
        # Shift by one: page numbers start at 1, list positions at 0.
        self.owner.page_nodes[number - 1]._raster_cache[name] = picture

Index

document

PdfDocument

as_tempfile(**kwargs)

get_page_render_size(page_number, dpi=DEFAULT_DPI)

rasterize_page(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes')

rasterize_page_to_data_uri(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, render_grayscale=False)

rasterize_pdf(dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes', render_grayscale=False)

split(start=None, stop=None)

to_compressed_bytes(compression_kwargs={})

write_to_path(path, **kwargs)

get_page_render_size_from_bytes(file_bytes, page_number, dpi=DEFAULT_DPI)

layout

BoundingPoly

NormBBox

combine(*bboxes) classmethod

from_bounding_poly(bounding_poly) classmethod

x_overlap(other)

y_overlap(other)

Point

TextBlock

pipeline

BaseMetadata

owner: TMetadataOwner property writable

task_results: TaskResultsDescriptor property writable

__delattr__(name)

__delitem__(name)

__getattr__(name)

__getitem__(name)

__iter__()

__len__()

__repr__()

__setattr__(name, value)

__setitem__(name, value)

from_owner(owner, **data) classmethod

validate_data_fields_from_annotations(data) classmethod

DocumentCollection

DocumentNode

persistance_path property writable

from_storage(path, file_hash, **kwargs) classmethod

metadata_class() classmethod

page_metadata_class() classmethod

persist(path=None, **kwargs)

refresh_locator()

PageNode

metadata

BaseMetadata

owner: TMetadataOwner property writable

task_results: TaskResultsDescriptor property writable

__delattr__(name)

__delitem__(name)

__getattr__(name)

__getitem__(name)

__iter__()

__len__()

__repr__()

__setattr__(name, value)

__setitem__(name, value)

from_owner(owner, **data) classmethod

validate_data_fields_from_annotations(data) classmethod

node

DocumentCollection

DocumentNode

persistance_path property writable

from_storage(path, file_hash, **kwargs) classmethod

metadata_class() classmethod

page_metadata_class() classmethod

persist(path=None, **kwargs)

refresh_locator()

PageNode

base

BaseNode

collection

DocumentCollection

document

DocumentNode

persistance_path property writable

from_storage(path, file_hash, **kwargs) classmethod

metadata_class() classmethod

page_metadata_class() classmethod

`document`

`PdfDocument`

`as_tempfile(**kwargs)`

`get_page_render_size(page_number, dpi=DEFAULT_DPI)`

`rasterize_page(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes')`

`rasterize_page_to_data_uri(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, render_grayscale=False)`

`rasterize_pdf(dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes', render_grayscale=False)`

`split(start=None, stop=None)`

`to_compressed_bytes(compression_kwargs={})`

`write_to_path(path, **kwargs)`

`get_page_render_size_from_bytes(file_bytes, page_number, dpi=DEFAULT_DPI)`

`layout`

`BoundingPoly`

`NormBBox`

`combine(*bboxes)` `classmethod`

`from_bounding_poly(bounding_poly)` `classmethod`

`x_overlap(other)`

`y_overlap(other)`

`Point`

`TextBlock`

`pipeline`

`BaseMetadata`

`owner: TMetadataOwner` `property` `writable`

`task_results: TaskResultsDescriptor` `property` `writable`

`__delattr__(name)`

`__delitem__(name)`

`__getattr__(name)`

`__getitem__(name)`

`__iter__()`

`__len__()`

`__repr__()`

`__setattr__(name, value)`

`__setitem__(name, value)`

`from_owner(owner, **data)` `classmethod`

`validate_data_fields_from_annotations(data)` `classmethod`

`DocumentCollection`

`DocumentNode`

`persistance_path` `property` `writable`

`from_storage(path, file_hash, **kwargs)` `classmethod`

`metadata_class()` `classmethod`

`page_metadata_class()` `classmethod`

`persist(path=None, **kwargs)`

`refresh_locator()`

`PageNode`

`metadata`

`BaseMetadata`

`owner: TMetadataOwner` `property` `writable`

`task_results: TaskResultsDescriptor` `property` `writable`

`__delattr__(name)`

`__delitem__(name)`

`__getattr__(name)`

`__getitem__(name)`

`__iter__()`

`__len__()`

`__repr__()`

`__setattr__(name, value)`

`__setitem__(name, value)`

`from_owner(owner, **data)` `classmethod`

`validate_data_fields_from_annotations(data)` `classmethod`

`node`

`DocumentCollection`

`DocumentNode`

`persistance_path` `property` `writable`

`from_storage(path, file_hash, **kwargs)` `classmethod`

`metadata_class()` `classmethod`

`page_metadata_class()` `classmethod`

`persist(path=None, **kwargs)`

`refresh_locator()`

`PageNode`

`base`

`BaseNode`

`collection`

`DocumentCollection`

`document`

`DocumentNode`

`persistance_path` `property` `writable`

`from_storage(path, file_hash, **kwargs)` `classmethod`

`metadata_class()` `classmethod`

`page_metadata_class()` `classmethod`

`persist(path=None, **kwargs)`