Skip to content

document

PdfDocument

Bases: BaseModel

Represents a PDF document

Source code in docprompt/schema/document.py
class PdfDocument(BaseModel):
    """
    Represents a PDF document

    Holds the raw PDF bytes together with a display name and, optionally, the
    path the bytes were read from. Provides helpers to rasterize pages, split
    the document by page range, compress it, and write it to disk.
    """

    # Display name; also used as the file name by `write_to_path` when the
    # target path is a directory.
    name: str = Field(description="The name of the document")
    # Raw PDF bytes. Hidden from repr and serialized as base64-encoded gzip
    # (see `serialize_file_bytes`).
    file_bytes: bytes = Field(description="The bytes of the document", repr=False)
    # Path the bytes were read from; set by `from_path`, otherwise None.
    file_path: Optional[str] = None

    # NOTE(review): no method of this class reads the password.
    password: Optional[SecretStr] = None

    def __len__(self):
        """Return the number of pages in the document."""
        return self.num_pages

    def __hash__(self):
        """Hash the document by the hash of its bytes (`document_hash`)."""
        return hash(self.document_hash)

    @computed_field
    @cached_property
    def page_count(self) -> PositiveInt:
        """Number of pages, computed once from `file_bytes` and then cached."""
        # Imported lazily, as in the original; presumably to avoid an import
        # cycle with docprompt.utils -- TODO confirm.
        from docprompt.utils.util import get_page_count

        return get_page_count(self.file_bytes)

    @property
    def num_pages(self):
        """Alias for `page_count`."""
        return self.page_count

    @property
    def bytes_per_page(self):
        """Average number of bytes per page (total byte length / page count)."""
        return len(self.file_bytes) / self.num_pages

    @computed_field
    @cached_property
    def document_hash(self) -> str:
        """Hash of `file_bytes`, computed once and then cached."""
        from docprompt.utils.util import hash_from_bytes

        return hash_from_bytes(self.file_bytes)

    @field_serializer("file_bytes")
    def serialize_file_bytes(self, v: bytes, _info):
        """Serialize the bytes as a UTF-8 string of base64-encoded gzip data."""
        compressed = gzip.compress(v)

        return base64.b64encode(compressed).decode("utf-8")

    @field_validator("file_bytes")
    def validate_file_bytes(cls, v: bytes):
        """
        Validate and normalise the incoming bytes to raw PDF bytes.

        Input detected as text is base64-decoded, and input detected as gzip
        is decompressed, which mirrors the encoding applied by
        `serialize_file_bytes`. The result must be detected as a PDF.

        Raises:
            ValueError: If the value is not `bytes`, is empty, or is not
                detected as a PDF after decoding.
        """
        if not isinstance(v, bytes):
            raise ValueError("File bytes must be bytes")

        if len(v) == 0:
            raise ValueError("File bytes must not be empty")

        # The checks run in sequence so base64-wrapped gzip unwraps in two steps.
        if filetype.guess_mime(v) == "text/plain":
            v = base64.b64decode(v, validate=True)

        if filetype.guess_mime(v) == "application/gzip":
            v = gzip.decompress(v)

        if filetype.guess_mime(v) != "application/pdf":
            raise ValueError("File bytes must be a PDF")

        return v

    @classmethod
    def from_path(cls, file_path: Union[PathLike, str]):
        """
        Create a document from a file on disk.

        The document name is the file's base name and `file_path` is recorded
        as a string.

        Raises:
            ValueError: If `file_path` does not point to an existing file.
        """
        file_path = Path(file_path)

        if not file_path.is_file():
            raise ValueError(f"File path {file_path} is not a file")

        file_bytes = file_path.read_bytes()

        return cls(name=file_path.name, file_path=str(file_path), file_bytes=file_bytes)

    @classmethod
    def from_bytes(cls, file_bytes: bytes, name: Optional[str] = None):
        """
        Create a document from in-memory bytes.

        When `name` is omitted, a name of the form `PDF-<ISO timestamp>.pdf`
        is generated from the current local time.
        """
        if name is None:
            name = f"PDF-{datetime.now().isoformat()}.pdf"

        return cls(name=name, file_bytes=file_bytes)

    def get_bytes(self) -> bytes:
        """Return the document bytes. Deprecated: use `file_bytes` directly."""
        return self.file_bytes  # Deprecated

    @property
    def path(self):
        """Alias for `file_path`."""
        return self.file_path

    def get_page_render_size(
        self, page_number: int, dpi: int = DEFAULT_DPI
    ) -> Tuple[int, int]:
        """
        Returns the render size of a page in pixels

        Delegates to `get_page_render_size_from_bytes` with this document's
        bytes. The result is a `(width, height)` tuple.
        """
        return get_page_render_size_from_bytes(self.get_bytes(), page_number, dpi=dpi)

    def to_compressed_bytes(self, compression_kwargs: Optional[dict] = None) -> bytes:
        """
        Compresses the document using Ghostscript

        The document is written to a temporary file whose path is passed to
        `compress_pdf_to_bytes`.

        Args:
            compression_kwargs: Extra keyword arguments forwarded to
                `compress_pdf_to_bytes`. Defaults to no extra arguments.
        """
        # None replaces the former mutable `{}` default, which was shared
        # across calls.
        extra_kwargs = compression_kwargs or {}

        with self.as_tempfile() as temp_path:
            return compress_pdf_to_bytes(temp_path, **extra_kwargs)

    def rasterize_page(
        self,
        page_number: int,
        *,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        return_mode: Literal["pil", "bytes"] = "bytes",
    ):
        """
        Rasterizes a page of the document using Pdfium

        Args:
            page_number: 1-based page number; must be within
                `1..num_pages`.
            dpi: Render resolution; passed to Pdfium as a scale of `dpi / 72`.
            downscale_size: Optional `(width, height)` forwarded to
                `process_raster_image` as `resize_width` / `resize_height`.
            return_mode: Forwarded to `rasterize_page_with_pdfium`.

        The remaining keyword arguments are forwarded unchanged to
        `process_raster_image`, which runs as a post-processing step only when
        at least one of `downscale_size`, `max_file_size_bytes`,
        `resize_aspect_ratios`, `do_convert` or `do_quantize` is truthy.

        Raises:
            ValueError: If `page_number` is outside `1..num_pages`.
        """
        if page_number <= 0 or page_number > self.num_pages:
            # The lower bound is 1: page 0 is rejected by the check above.
            raise ValueError(f"Page number must be between 1 and {self.num_pages}")

        post_process_fn = None

        if any(
            (
                downscale_size,
                max_file_size_bytes,
                resize_aspect_ratios,
                do_convert,
                do_quantize,
            )
        ):
            post_process_fn = partial(
                process_raster_image,
                resize_width=downscale_size[0] if downscale_size else None,
                resize_height=downscale_size[1] if downscale_size else None,
                resize_mode=resize_mode,
                resize_aspect_ratios=resize_aspect_ratios,
                do_convert=do_convert,
                image_convert_mode=image_convert_mode,
                do_quantize=do_quantize,
                quantize_color_count=quantize_color_count,
                max_file_size_bytes=max_file_size_bytes,
            )

        rastered = rasterize_page_with_pdfium(
            self.file_bytes,
            page_number,
            return_mode=return_mode,
            post_process_fn=post_process_fn,
            scale=(1 / 72) * dpi,
        )

        return rastered

    def rasterize_page_to_data_uri(
        self,
        page_number: int,
        *,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        render_grayscale: bool = False,
    ) -> str:
        """
        Rasterizes a page of the document using Pdfium and returns a data URI, which can
        be embedded into HTML or passed to large language models

        The page is rendered through `rasterize_page` in "bytes" mode and the
        result is base64-encoded into a `data:image/png;base64,...` string.

        NOTE(review): `render_grayscale` is accepted but not forwarded to
        `rasterize_page`, so it currently has no effect here.
        """
        image_bytes = self.rasterize_page(
            page_number,
            dpi=dpi,
            downscale_size=downscale_size,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            resize_mode=resize_mode,
            max_file_size_bytes=max_file_size_bytes,
            resize_aspect_ratios=resize_aspect_ratios,
            return_mode="bytes",
        )
        return f"data:image/png;base64,{base64.b64encode(image_bytes).decode('utf-8')}"

    def rasterize_pdf(
        self,
        dpi: int = DEFAULT_DPI,
        downscale_size: Optional[Tuple[int, int]] = None,
        resize_mode: ResizeModes = "thumbnail",
        max_file_size_bytes: Optional[int] = None,
        resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
        do_convert: bool = False,
        image_convert_mode: str = "L",
        do_quantize: bool = False,
        quantize_color_count: int = 8,
        return_mode: Literal["pil", "bytes"] = "bytes",
        render_grayscale: bool = False,
    ) -> Dict[int, bytes]:
        """
        Rasterizes the entire document using Pdfium

        Returns a dict mapping 1-based page numbers to the rendered page, in
        the order yielded by `rasterize_pdf_with_pdfium`. Post-processing
        follows the same rule as `rasterize_page`: `process_raster_image` is
        applied only when at least one post-processing option is truthy.

        NOTE(review): the values are whatever `rasterize_pdf_with_pdfium`
        yields for `return_mode`; with "pil" they are presumably images rather
        than bytes, despite the return annotation -- confirm.
        """
        post_process_fn = None

        if any(
            (
                downscale_size,
                max_file_size_bytes,
                resize_aspect_ratios,
                do_convert,
                do_quantize,
            )
        ):
            post_process_fn = partial(
                process_raster_image,
                resize_width=downscale_size[0] if downscale_size else None,
                resize_height=downscale_size[1] if downscale_size else None,
                resize_mode=resize_mode,
                resize_aspect_ratios=resize_aspect_ratios,
                do_convert=do_convert,
                image_convert_mode=image_convert_mode,
                do_quantize=do_quantize,
                quantize_color_count=quantize_color_count,
                max_file_size_bytes=max_file_size_bytes,
            )

        rendered_pages = rasterize_pdf_with_pdfium(
            self.file_bytes,
            scale=(1 / 72) * dpi,
            grayscale=render_grayscale,
            return_mode=return_mode,
            post_process_fn=post_process_fn,
        )

        # Keys are 1-based page numbers.
        return dict(enumerate(rendered_pages, start=1))

    def split(self, start: Optional[int] = None, stop: Optional[int] = None):
        """
        Extracts a page range from this document into a new document

        Args:
            start: First page of the range, forwarded to `split_pdf_to_bytes`
                as `start_page`. Defaults to 0 when only `stop` is given.
            stop: End of the range, forwarded as `stop_page`.

        Returns:
            A new document of the same class, with the same name, built from
            the bytes returned by `split_pdf_to_bytes`.

        Raises:
            ValueError: If neither `start` nor `stop` is given.
        """
        if start is None and stop is None:
            raise ValueError("Must specify either start or stop")

        start = start or 0

        from docprompt.utils.splitter import split_pdf_to_bytes

        split_bytes = split_pdf_to_bytes(
            self.file_bytes, start_page=start, stop_page=stop
        )

        # Build the result from the instance's own class instead of the
        # `Document` name, which this class does not define.
        return type(self).from_bytes(split_bytes, name=self.name)

    def as_tempfile(self, **kwargs):
        """
        Returns a context manager that writes the document to a temporary file
        and yields that file's path

        Keyword arguments are passed to `tempfile.NamedTemporaryFile` and
        override the defaults (`mode="wb"`, `delete=True`, `suffix=".pdf"`).
        With the defaults the file is removed when the context exits.
        """

        @contextmanager
        def tempfile_context() -> Generator[str, None, None]:
            tempfile_kwargs = {"mode": "wb", "delete": True, "suffix": ".pdf", **kwargs}

            with tempfile.NamedTemporaryFile(**tempfile_kwargs) as f:
                f.write(self.file_bytes)
                # Flush so readers that open the path see the full content.
                f.flush()
                yield f.name

        return tempfile_context()

    def write_to_path(self, path: Union[PathLike, str], **kwargs):
        """
        Writes the document to a path

        If `path` is an existing directory, the document is written inside it
        under `self.name`; otherwise `path` is used as the file path.
        `kwargs` is accepted but not used.
        """
        path = Path(path)

        if path.is_dir():
            path = path / self.name

        with path.open("wb") as f:
            f.write(self.file_bytes)

as_tempfile(**kwargs)

Returns a context manager that writes the document to a temporary file and yields the file's path

Source code in docprompt/schema/document.py
def as_tempfile(self, **kwargs):
    """
    Returns a context manager that writes the document to a temporary file
    and yields that file's path

    Keyword arguments are passed to `tempfile.NamedTemporaryFile` and override
    the defaults (`mode="wb"`, `delete=True`, `suffix=".pdf"`).
    """

    @contextmanager
    def _materialized() -> Generator[str, None, None]:
        # Defaults first, then caller overrides.
        options = dict(mode="wb", delete=True, suffix=".pdf")
        options.update(kwargs)

        with tempfile.NamedTemporaryFile(**options) as handle:
            handle.write(self.file_bytes)
            # Flush so readers that open the path see the full content.
            handle.flush()
            yield handle.name

    return _materialized()

get_page_render_size(page_number, dpi=DEFAULT_DPI)

Returns the render size of a page in pixels

Source code in docprompt/schema/document.py
def get_page_render_size(
    self, page_number: int, dpi: int = DEFAULT_DPI
) -> Tuple[int, int]:
    """
    Returns the size in pixels that a page has when rendered at `dpi`

    Delegates to `get_page_render_size_from_bytes` using this document's
    bytes; the result is a `(width, height)` tuple.
    """
    pdf_bytes = self.get_bytes()
    render_size = get_page_render_size_from_bytes(pdf_bytes, page_number, dpi=dpi)
    return render_size

rasterize_page(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes')

Rasterizes a page of the document using Pdfium

Source code in docprompt/schema/document.py
def rasterize_page(
    self,
    page_number: int,
    *,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    return_mode: Literal["pil", "bytes"] = "bytes",
):
    """
    Rasterizes a page of the document using Pdfium

    Args:
        page_number: 1-based page number; must be within `1..num_pages`.
        dpi: Render resolution; passed to Pdfium as a scale of `dpi / 72`.
        downscale_size: Optional `(width, height)` forwarded to
            `process_raster_image` as `resize_width` / `resize_height`.
        return_mode: Forwarded to `rasterize_page_with_pdfium`.

    The remaining keyword arguments are forwarded unchanged to
    `process_raster_image`, which runs as a post-processing step only when at
    least one of `downscale_size`, `max_file_size_bytes`,
    `resize_aspect_ratios`, `do_convert` or `do_quantize` is truthy.

    Raises:
        ValueError: If `page_number` is outside `1..num_pages`.
    """
    if page_number <= 0 or page_number > self.num_pages:
        # The lower bound is 1: page 0 is rejected by the check above.
        raise ValueError(f"Page number must be between 1 and {self.num_pages}")

    post_process_fn = None

    if any(
        (
            downscale_size,
            max_file_size_bytes,
            resize_aspect_ratios,
            do_convert,
            do_quantize,
        )
    ):
        post_process_fn = partial(
            process_raster_image,
            resize_width=downscale_size[0] if downscale_size else None,
            resize_height=downscale_size[1] if downscale_size else None,
            resize_mode=resize_mode,
            resize_aspect_ratios=resize_aspect_ratios,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            max_file_size_bytes=max_file_size_bytes,
        )

    rastered = rasterize_page_with_pdfium(
        self.file_bytes,
        page_number,
        return_mode=return_mode,
        post_process_fn=post_process_fn,
        scale=(1 / 72) * dpi,
    )

    return rastered

rasterize_page_to_data_uri(page_number, *, dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, render_grayscale=False)

Rasterizes a page of the document using Pdfium and returns a data URI, which can be embedded into HTML or passed to large language models

Source code in docprompt/schema/document.py
def rasterize_page_to_data_uri(
    self,
    page_number: int,
    *,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    render_grayscale: bool = False,
) -> str:
    """
    Renders one page via `rasterize_page` (in "bytes" mode) and wraps the
    result in a `data:image/png;base64,...` URI, suitable for embedding into
    HTML or passing to large language models

    NOTE(review): `render_grayscale` is accepted but not forwarded to
    `rasterize_page`, so it has no effect here.
    """
    raster_options = dict(
        dpi=dpi,
        downscale_size=downscale_size,
        do_convert=do_convert,
        image_convert_mode=image_convert_mode,
        do_quantize=do_quantize,
        quantize_color_count=quantize_color_count,
        resize_mode=resize_mode,
        max_file_size_bytes=max_file_size_bytes,
        resize_aspect_ratios=resize_aspect_ratios,
        return_mode="bytes",
    )
    page_image = self.rasterize_page(page_number, **raster_options)

    encoded = base64.b64encode(page_image).decode("utf-8")
    return "data:image/png;base64," + encoded

rasterize_pdf(dpi=DEFAULT_DPI, downscale_size=None, resize_mode='thumbnail', max_file_size_bytes=None, resize_aspect_ratios=None, do_convert=False, image_convert_mode='L', do_quantize=False, quantize_color_count=8, return_mode='bytes', render_grayscale=False)

Rasterizes the entire document using Pdfium

Source code in docprompt/schema/document.py
def rasterize_pdf(
    self,
    dpi: int = DEFAULT_DPI,
    downscale_size: Optional[Tuple[int, int]] = None,
    resize_mode: ResizeModes = "thumbnail",
    max_file_size_bytes: Optional[int] = None,
    resize_aspect_ratios: Optional[Iterable[AspectRatioRule]] = None,
    do_convert: bool = False,
    image_convert_mode: str = "L",
    do_quantize: bool = False,
    quantize_color_count: int = 8,
    return_mode: Literal["pil", "bytes"] = "bytes",
    render_grayscale: bool = False,
) -> Dict[int, bytes]:
    """
    Rasterizes every page of the document using Pdfium

    Returns a dict mapping 1-based page numbers to the rendered page, in the
    order yielded by `rasterize_pdf_with_pdfium`. `process_raster_image` is
    applied as a post-processing step only when at least one of
    `downscale_size`, `max_file_size_bytes`, `resize_aspect_ratios`,
    `do_convert` or `do_quantize` is truthy.
    """
    needs_post_processing = bool(
        downscale_size
        or max_file_size_bytes
        or resize_aspect_ratios
        or do_convert
        or do_quantize
    )

    if needs_post_processing:
        post_process_fn = partial(
            process_raster_image,
            resize_width=downscale_size[0] if downscale_size else None,
            resize_height=downscale_size[1] if downscale_size else None,
            resize_mode=resize_mode,
            resize_aspect_ratios=resize_aspect_ratios,
            do_convert=do_convert,
            image_convert_mode=image_convert_mode,
            do_quantize=do_quantize,
            quantize_color_count=quantize_color_count,
            max_file_size_bytes=max_file_size_bytes,
        )
    else:
        post_process_fn = None

    rendered_pages = rasterize_pdf_with_pdfium(
        self.file_bytes,
        scale=(1 / 72) * dpi,
        grayscale=render_grayscale,
        return_mode=return_mode,
        post_process_fn=post_process_fn,
    )

    # Keys are 1-based page numbers.
    return dict(enumerate(rendered_pages, start=1))

split(start=None, stop=None)

Extracts a page range (start to stop) from the document and returns it as a new document

Source code in docprompt/schema/document.py
def split(self, start: Optional[int] = None, stop: Optional[int] = None):
    """
    Extracts a page range from this document into a new document

    Args:
        start: First page of the range, forwarded to `split_pdf_to_bytes` as
            `start_page`. Defaults to 0 when only `stop` is given.
        stop: End of the range, forwarded as `stop_page`.

    Returns:
        A new document of the same class, with the same name, built from the
        bytes returned by `split_pdf_to_bytes`.

    Raises:
        ValueError: If neither `start` nor `stop` is given.
    """
    if start is None and stop is None:
        raise ValueError("Must specify either start or stop")

    start = start or 0

    from docprompt.utils.splitter import split_pdf_to_bytes

    split_bytes = split_pdf_to_bytes(
        self.file_bytes, start_page=start, stop_page=stop
    )

    # Build the result from the instance's own class instead of the
    # `Document` name, which is not defined alongside this method.
    return type(self).from_bytes(split_bytes, name=self.name)

to_compressed_bytes(compression_kwargs={})

Compresses the document using Ghostscript

Source code in docprompt/schema/document.py
def to_compressed_bytes(self, compression_kwargs: Optional[dict] = None) -> bytes:
    """
    Compresses the document using Ghostscript

    The document is written to a temporary file whose path is passed to
    `compress_pdf_to_bytes`.

    Args:
        compression_kwargs: Extra keyword arguments forwarded to
            `compress_pdf_to_bytes`. Defaults to no extra arguments.
    """
    # None replaces the former mutable `{}` default, which was shared across
    # calls.
    extra_kwargs = compression_kwargs or {}

    with self.as_tempfile() as temp_path:
        return compress_pdf_to_bytes(temp_path, **extra_kwargs)

write_to_path(path, **kwargs)

Writes the document to a path

Source code in docprompt/schema/document.py
def write_to_path(self, path: Union[PathLike, str], **kwargs):
    """
    Writes the document bytes to disk

    If `path` is an existing directory, the file is created inside it under
    `self.name`; otherwise `path` itself is the destination file. `kwargs` is
    accepted but not used.
    """
    target = Path(path)
    destination = target / self.name if target.is_dir() else target

    destination.write_bytes(self.file_bytes)

get_page_render_size_from_bytes(file_bytes, page_number, dpi=DEFAULT_DPI)

Returns the render size of a page in pixels

Source code in docprompt/schema/document.py
def get_page_render_size_from_bytes(
    file_bytes: bytes, page_number: int, dpi: int = DEFAULT_DPI
):
    """
    Returns the `(width, height)` in pixels of a page rendered at `dpi`

    The size is derived from the page's media box: each side is truncated to
    a whole number, scaled by `dpi / 72`, and truncated again.

    NOTE(review): `page_number` is passed straight to `pdf.get_page`; confirm
    whether that index is 0-based or 1-based for callers.
    """

    def _to_pixels(low, high) -> int:
        # Truncate the media-box span first, then scale and truncate again.
        return int(int(high - low) * dpi / 72)

    with get_pdfium_document(file_bytes) as pdf:
        box = pdf.get_page(page_number).get_mediabox()

        return _to_pixels(box[0], box[2]), _to_pixels(box[1], box[3])