Attempts to determine the name of a PDF by examining its metadata

Source code in docprompt/utils/
def determine_pdf_name_from_bytes(file_bytes: bytes) -> str:
    """
    Attempts to determine the name of a PDF by examining metadata

    The first non-blank metadata entry among the candidate keys is used,
    with ".pdf" appended. When no usable entry exists, the name falls back
    to ``document-<hash>.pdf`` where the hash comes from ``hash_from_bytes``.

    Args:
        file_bytes: The raw bytes of the PDF.

    Returns:
        A file name ending in ".pdf".
    """
    with get_pdfium_document(file_bytes) as pdf:
        metadata_dict = pdf.get_metadata_dict(skip_empty=True)

    if metadata_dict:
        # NOTE(review): the first lookup of this chain was lost from the
        # listing; "Title" is assumed to be the intended leading key -- confirm.
        for key in ("Title", "Subject", "Author"):
            # Strip before testing so a whitespace-only entry cannot produce
            # a bare ".pdf" name; the next candidate is tried instead.
            name = (metadata_dict.get(key) or "").strip()
            if name:
                return f"{name}.pdf"

    return f"document-{hash_from_bytes(file_bytes)}.pdf"


Determines the number of pages in a PDF

Source code in docprompt/utils/
def get_page_count(fd: Union[Path, PathLike, bytes]) -> int:
    """
    Determines the number of pages in a PDF

    Args:
        fd: Either the raw PDF bytes, or a path that is opened in binary
            mode and read fully into memory.

    Returns:
        The length of the document opened by ``get_pdfium_document``.
    """
    if isinstance(fd, bytes):
        pdf_bytes = fd
    else:
        with open(fd, "rb") as f:
            pdf_bytes = f.read()

    with get_pdfium_document(pdf_bytes) as pdf:
        return len(pdf)

hash_from_bytes(byte_data, hash_func=hashlib.md5, threshold=1024 * 1024 * 128)

Gets a hash from bytes. If the bytes are larger than the threshold, the hash is computed in chunks to avoid memory issues. The default hash function is MD5 with a threshold of 128MB which is optimal for most machines and use cases.

Source code in docprompt/utils/
def hash_from_bytes(
    byte_data: bytes, hash_func=hashlib.md5, threshold=1024 * 1024 * 128
) -> str:
    """
    Gets a hash from bytes. If the bytes are larger than the threshold, the
    hash is computed in chunks. The default hash function is MD5 with a
    threshold of 128MB.

    Args:
        byte_data: The data to hash.
        hash_func: A ``hashlib``-style constructor, called with no arguments;
            the returned object must provide ``update`` and ``hexdigest``.
        threshold: Size in bytes above which the data is fed to the hash in
            128KB chunks instead of in a single call.

    Returns:
        The hexadecimal digest of ``byte_data``.
    """
    # Always honour hash_func, including for small inputs.
    digest = hash_func()

    if len(byte_data) <= threshold:
        digest.update(byte_data)
        return digest.hexdigest()

    # Slicing a memoryview does not copy, so each chunk is handed to the
    # hash without duplicating the underlying buffer.
    chunk_size = 128 * 1024
    view = memoryview(byte_data)
    for start in range(0, len(view), chunk_size):
        digest.update(view[start : start + chunk_size])

    return digest.hexdigest()


Determines if a file is a PDF

Source code in docprompt/utils/
def is_pdf(fd: Union[Path, PathLike, bytes]) -> bool:
    """
    Determines if a file is a PDF

    Args:
        fd: Raw bytes or a ``str``, either of which is handed directly to
            ``filetype.guess_mime``, or another path-like object, which is
            opened in binary mode.

    Returns:
        True when the guessed MIME type is "application/pdf".
    """
    if isinstance(fd, (bytes, str)):
        mime = filetype.guess_mime(fd)
    else:
        with open(fd, "rb") as f:
            # We only need the first 1024 bytes to determine if it's a PDF
            mime = filetype.guess_mime(f.read(1024))

    return mime == "application/pdf"

load_pdf_document(fp, *, file_name=None, password=None)

Loads a document from a file path

Source code in docprompt/utils/
def load_pdf_document(
    fp: Union[Path, PathLike, bytes],
    file_name: Optional[str] = None,
    password: Optional[str] = None,
) -> PdfDocument:
    # Loads a document from a file path (or from raw PDF bytes).
    # Raises ValueError when is_pdf() rejects the loaded bytes.
    # NOTE(review): this listing is truncated -- see the notes below before
    # relying on it.
    if isinstance(fp, bytes):
        file_bytes = fp
        # Without an explicit file_name, derive one from the PDF bytes.
        file_name = file_name or determine_pdf_name_from_bytes(file_bytes)
        # NOTE(review): the next two statements treat fp as a path, so they
        # presumably belong to an `else:` branch that was lost -- confirm.
        file_name = name_from_path(fp) if file_name is None else file_name

        file_bytes = read_pdf_bytes_from_path(fp)

    if not is_pdf(file_bytes):
        raise ValueError("File is not a PDF")

    # NOTE(review): the constructor arguments are cut off in this listing;
    # how file_name, file_bytes and password are passed cannot be told from here.
    return PdfDocument(

load_pdf_documents(fps, *, max_threads=12, passwords=None)

Loads multiple documents from file paths, using a thread pool

Source code in docprompt/utils/
def load_pdf_documents(
    fps: List[Union[Path, PathLike, bytes]],
    max_threads: int = 12,
    passwords: Optional[List[str]] = None,
):
    """
    Loads multiple documents from file paths, using a thread pool

    Args:
        fps: The paths (or raw bytes) to load; each is passed to
            ``load_document``.
        max_threads: Upper bound on the number of worker threads; the pool
            never has more workers than there are inputs.
        passwords: Currently unused.
            NOTE(review): presumably meant to pair with ``fps`` -- confirm
            before wiring it through.

    Returns:
        A list with the result of ``load_document`` for every input, in
        completion order (which may differ from the order of ``fps``).
    """
    if not fps:
        # ThreadPoolExecutor raises ValueError for max_workers=0, so an empty
        # input is answered directly.
        return []

    thread_count = min(max_threads, len(fps))

    with ThreadPoolExecutor(max_workers=thread_count) as executor:
        futures = [executor.submit(load_document, fp) for fp in fps]

    # future.result() re-raises any exception raised inside a worker.
    return [future.result() for future in as_completed(futures)]