date_extraction

extract_dates_from_text(input_string, *, date_formats=default_date_formats)

Extract dates from a string using a set of predefined regex patterns.

Returns a list of tuples, where the first element is the date object and the second is the full date string.

Source code in docprompt/utils/date_extraction.py
def extract_dates_from_text(
    input_string: str, *, date_formats: DateFormatsType = default_date_formats
) -> List[Tuple[date, str]]:
    """
    Extract dates from a string using a set of predefined regex patterns.

    Returns a list of tuples, where the first element is the date object and the second is the full date string.
    """
    extracted_dates = []

    for regex, date_format in date_formats:
        matches = regex.findall(input_string)

        for match_obj in matches:
            # Extract the full date from the match
            full_date = match_obj[0]  # First group captures the entire date

            if "%d" in date_format:
                parse_date = re.sub(r"(st|nd|rd|th)", "", full_date)
            else:
                parse_date = full_date

            parse_date = re.sub(r"\s+", " ", parse_date).strip()
            parse_date = re.sub(
                r"\s{1,},", ",", parse_date
            ).strip()  # Commas shouldnt have spaces before them

            # Convert to datetime object
            try:
                date_obj = datetime.strptime(parse_date, date_format)
            except ValueError as e:
                print(f"Error parsing date '{full_date}': {e}")
                continue

            extracted_dates.append((date_obj.date(), full_date))

    return extracted_dates
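
Example (a minimal sketch; the import path follows the source location above, and which patterns fire depends on the contents of default_date_formats):

from docprompt.utils.date_extraction import extract_dates_from_text

text = "Signed on January 5, 2021; effective 01/05/2021."

for date_obj, raw in extract_dates_from_text(text):
    print(date_obj.isoformat(), "<-", raw)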

inference

A utility module for running inference with various LLM providers.

run_batch_inference_anthropic(model_name, messages, **kwargs) async

Run batch inference using an Anthropic model asynchronously.

Source code in docprompt/utils/inference.py
async def run_batch_inference_anthropic(
    model_name: str, messages: List[List[OpenAIMessage]], **kwargs
) -> List[str]:
    """Run batch inference using an Anthropic model asynchronously."""
    retry_decorator = get_anthropic_retry_decorator()

    @retry_decorator
    async def process_message_set(msg_set):
        return await run_inference_anthropic(model_name, msg_set, **kwargs)

    tasks = [process_message_set(msg_set) for msg_set in messages]

    responses: List[str] = []
    for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing messages"):
        response = await f
        responses.append(response)

    return responses
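
Example (a hedged sketch; assumes ANTHROPIC_API_KEY is set in the environment and that OpenAIMessage is exported from the same module, which is an assumption; the model name is illustrative):

import asyncio

from docprompt.utils.inference import OpenAIMessage, run_batch_inference_anthropic

message_sets = [
    [OpenAIMessage(role="user", content="Summarize document A")],
    [OpenAIMessage(role="user", content="Summarize document B")],
]

responses = asyncio.run(
    run_batch_inference_anthropic("claude-3-haiku-20240307", message_sets)
)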

run_inference_anthropic(model_name, messages, **kwargs) async

Run inference using an Anthropic model asynchronously.

Source code in docprompt/utils/inference.py
async def run_inference_anthropic(
    model_name: str, messages: List[OpenAIMessage], **kwargs
) -> str:
    """Run inference using an Anthropic model asynchronously."""
    from anthropic import AsyncAnthropic

    api_key = kwargs.pop("api_key", os.environ.get("ANTHROPIC_API_KEY"))
    client = AsyncAnthropic(api_key=api_key)

    system = None
    if messages and messages[0].role == "system":
        system = messages[0].content
        messages = messages[1:]

    processed_messages = []
    for msg in messages:
        if isinstance(msg.content, list):
            processed_content = []
            for content in msg.content:
                if isinstance(content, OpenAIComplexContent):
                    content = content.to_anthropic_message()
                    processed_content.append(content)
                else:
                    # Content items that are not OpenAIComplexContent are
                    # silently skipped; a stricter implementation would raise:
                    # raise ValueError(f"Invalid content type: {type(content)} Expected OpenAIComplexContent")
                    pass

            dumped = msg.model_dump()
            dumped["content"] = processed_content
            processed_messages.append(dumped)
        else:
            processed_messages.append(msg)

    client_kwargs = {
        "model": model_name,
        "max_tokens": 2048,
        "messages": processed_messages,
        **kwargs,
    }

    if system:
        client_kwargs["system"] = system

    response = await client.messages.create(**client_kwargs)

    content = response.content[0].text

    return content
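
Example (same assumptions as the batch sketch above; a leading system message is lifted into Anthropic's system parameter, per the source):

import asyncio

from docprompt.utils.inference import OpenAIMessage, run_inference_anthropic

messages = [
    OpenAIMessage(role="system", content="You are a terse assistant."),
    OpenAIMessage(role="user", content="In one sentence, what is a PDF xref table?"),
]

text = asyncio.run(run_inference_anthropic("claude-3-haiku-20240307", messages))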

masking

image

mask_image_from_bounding_boxes(image, *bounding_boxes, mask_color='#000000')

Create a copy of the image with the positions of the bounding boxes masked.

Source code in docprompt/utils/masking/image.py
def mask_image_from_bounding_boxes(
    image: Image.Image,
    *bounding_boxes: NormBBox,
    mask_color: str = "#000000",
):
    """
    Create a copy of the image with the positions of the bounding boxes masked.
    """

    # alpha_composite requires an RGBA base image, so normalize the mode
    # before compositing
    image = image.convert("RGBA")

    width, height = image.size

    mask = Image.new("RGBA", (width, height), (0, 0, 0, 0))

    for bbox in bounding_boxes:
        # NormBBox coordinates are normalized, so scale the patch size as
        # well as its position into pixel space
        mask.paste(
            Image.new(
                "RGBA",
                (int(bbox.width * width), int(bbox.height * height)),
                mask_color,
            ),
            (int(bbox.x0 * width), int(bbox.top * height)),
        )

    return Image.alpha_composite(image, mask)
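
Example (a minimal sketch; the NormBBox import path and constructor fields are assumptions based on the attributes the source reads):

from PIL import Image

from docprompt.schema.layout import NormBBox  # assumed import path
from docprompt.utils.masking.image import mask_image_from_bounding_boxes

image = Image.open("page.png")

# Hypothetical box covering the top-left quadrant of the page
bbox = NormBBox(x0=0.0, top=0.0, x1=0.5, bottom=0.5)

masked = mask_image_from_bounding_boxes(image, bbox, mask_color="#FF0000")
masked.save("masked.png")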

splitter

pdf_split_iter_fast(file_bytes, max_page_count)

Quickly splits a PDF into batches of up to max_page_count pages.

Source code in docprompt/utils/splitter.py
def pdf_split_iter_fast(file_bytes: bytes, max_page_count: int) -> Iterator[bytes]:
    """
    Splits a PDF into batches of pages up to `max_page_count` pages quickly.
    """
    with get_pdfium_document(file_bytes) as src_pdf:
        current_page = 0
        total_pages = len(src_pdf)

        while current_page < total_pages:
            # Determine the last page for the current batch
            last_page = min(current_page + max_page_count, total_pages)

            with writable_temp_pdf() as dst_pdf:
                # Append pages to the batch
                dst_pdf.import_pages(src_pdf, list(range(current_page, last_page)))

                # Save the batch PDF to a bytes buffer
                pdf_bytes_buffer = io.BytesIO()
                dst_pdf.save(pdf_bytes_buffer)
                pdf_bytes_buffer.seek(0)  # Reset buffer pointer to the beginning

            # Yield the bytes of the batch PDF
            yield pdf_bytes_buffer.getvalue()

            # Update the current page for the next batch
            current_page += max_page_count
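
Example (a sketch that writes ten-page batches to disk; "large.pdf" is a placeholder):

from docprompt.utils.splitter import pdf_split_iter_fast

with open("large.pdf", "rb") as f:
    file_bytes = f.read()

for i, batch in enumerate(pdf_split_iter_fast(file_bytes, max_page_count=10)):
    with open(f"batch_{i:03d}.pdf", "wb") as out:
        out.write(batch)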

pdf_split_iter_with_max_bytes(file_bytes, max_page_count, max_bytes)

Splits a PDF into batches of at most max_page_count pages and max_bytes bytes each.

Source code in docprompt/utils/splitter.py
def pdf_split_iter_with_max_bytes(
    file_bytes: bytes, max_page_count: int, max_bytes: int
) -> Iterator[bytes]:
    """
    Splits a PDF into batches of pages up to `max_page_count` pages and `max_bytes` bytes.
    """
    for batch_bytes in pdf_split_iter_fast(file_bytes, max_page_count):
        if len(batch_bytes) <= max_bytes:
            yield batch_bytes
        else:
            # The batch exceeds max_bytes; re-split the oversized batch itself
            # (not the whole document) into fewer pages until it fits. Pages
            # trimmed from the batch by this fallback are not re-emitted.
            pages_in_batch = max_page_count
            while len(batch_bytes) > max_bytes and pages_in_batch > 1:
                pages_in_batch -= 1
                batch_bytes = next(pdf_split_iter_fast(batch_bytes, pages_in_batch))

            if len(batch_bytes) > max_bytes and pages_in_batch == 1:
                # If a single page is still too large, compress it
                with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
                    f.write(batch_bytes)
                    f.flush()
                    compressed_bytes = compress_pdf_to_bytes(f.name)
                yield compressed_bytes
            else:
                yield batch_bytes
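
Example (reusing file_bytes from the sketch above; the 5 MB cap is illustrative, e.g. for an upload API with a size limit):

from docprompt.utils.splitter import pdf_split_iter_with_max_bytes

for batch in pdf_split_iter_with_max_bytes(
    file_bytes, max_page_count=15, max_bytes=5 * 1024 * 1024
):
    print(f"batch of {len(batch)} bytes")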

split_pdf_to_bytes(file_bytes, *, start_page=None, stop_page=None)

Extracts a range of pages from a PDF, returning the new PDF as bytes.

Source code in docprompt/utils/splitter.py
def split_pdf_to_bytes(
    file_bytes: bytes,
    *,
    start_page: Optional[int] = None,
    stop_page: Optional[int] = None,
):
    """
    Splits a PDF into a list of bytes.
    """
    if start_page is None:
        start_page = 0
    if stop_page is None:
        stop_page = get_page_count(file_bytes)

    if stop_page <= start_page:
        raise ValueError("stop_page must be greater than start_page")

    # Load the PDF from bytes
    with get_pdfium_document(file_bytes) as src_pdf:
        # Create a new PDF for the current batch
        dst_pdf = pdfium.PdfDocument.new()

        # Append pages to the batch
        dst_pdf.import_pages(src_pdf, list(range(start_page, stop_page)))

        # Save the batch PDF to a bytes buffer
        pdf_bytes_buffer = io.BytesIO()
        dst_pdf.save(pdf_bytes_buffer)
        pdf_bytes_buffer.seek(0)  # Reset buffer pointer to the beginning

        # Return the bytes of the extracted PDF
        return pdf_bytes_buffer.getvalue()
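
Example (a sketch; stop_page is exclusive, since the source iterates range(start_page, stop_page)):

from docprompt.utils.splitter import split_pdf_to_bytes

first_five = split_pdf_to_bytes(file_bytes, start_page=0, stop_page=5)

with open("first_five.pdf", "wb") as out:
    out.write(first_five)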

util

determine_pdf_name_from_bytes(file_bytes)

Attempts to determine the name of a PDF by examining its metadata

Source code in docprompt/utils/util.py
def determine_pdf_name_from_bytes(file_bytes: bytes) -> str:
    """
    Attempts to determine the name of a PDF by examining its metadata
    """
    with get_pdfium_document(file_bytes) as pdf:
        metadata_dict = pdf.get_metadata_dict(skip_empty=True)

    name = None

    if metadata_dict:
        name = (
            metadata_dict.get("Title")
            or metadata_dict.get("Subject")
            or metadata_dict.get("Author")
        )

    if name:
        return f"{name.strip()}.pdf"

    return f"document-{hash_from_bytes(file_bytes)}.pdf"

get_page_count(fd)

Determines the number of pages in a PDF

Source code in docprompt/utils/util.py
def get_page_count(fd: Union[Path, PathLike, bytes]) -> int:
    """
    Determines the number of pages in a PDF
    """
    if not isinstance(fd, bytes):
        with open(fd, "rb") as f:
            fd = f.read()

    with get_pdfium_document(fd) as pdf:
        return len(pdf)
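
Example (both a path and raw bytes are accepted, per the signature):

from docprompt.utils.util import get_page_count

print(get_page_count("report.pdf"))     # from a path

with open("report.pdf", "rb") as f:
    print(get_page_count(f.read()))     # from raw bytes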

hash_from_bytes(byte_data, hash_func=hashlib.md5, threshold=1024 * 1024 * 128)

Gets a hash from bytes. If the bytes are larger than the threshold, the hash is computed in chunks to avoid memory issues. The default hash function is MD5 with a threshold of 128 MB, which is a reasonable default for most machines and use cases.

Source code in docprompt/utils/util.py
def hash_from_bytes(
    byte_data: bytes, hash_func=hashlib.md5, threshold=1024 * 1024 * 128
) -> str:
    """
    Gets a hash from bytes. If the bytes are larger than the threshold, the hash is computed in chunks
    to avoid memory issues. The default hash function is MD5 with a threshold of 128 MB, which is a
    reasonable default for most machines and use cases.
    """
    if len(byte_data) < 1024 * 1024 * 10:  # 10MB
        # Small payloads are hashed in a single call; use the configured
        # hash function rather than hard-coding MD5
        return hash_func(byte_data).hexdigest()

    hasher = hash_func()

    if len(byte_data) > threshold:
        stream = BytesIO(byte_data)
        b = bytearray(128 * 1024)
        mv = memoryview(b)

        # Stream the payload in 128 KB chunks to keep memory usage bounded
        while n := stream.readinto(mv):
            hasher.update(mv[:n])
    else:
        hasher.update(byte_data)

    return hasher.hexdigest()
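
Example (a sketch showing the default and an alternate hash function):

import hashlib

from docprompt.utils.util import hash_from_bytes

md5_digest = hash_from_bytes(file_bytes)
sha256_digest = hash_from_bytes(file_bytes, hash_func=hashlib.sha256)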

is_pdf(fd)

Determines if a file is a PDF

Source code in docprompt/utils/util.py
def is_pdf(fd: Union[Path, PathLike, bytes]) -> bool:
    """
    Determines if a file is a PDF
    """
    if isinstance(fd, (bytes, str)):
        mime = filetype.guess_mime(fd)
    else:
        with open(fd, "rb") as f:
            # We only need the first 1024 bytes to determine if it's a PDF
            mime = filetype.guess_mime(f.read(1024))

    return mime == "application/pdf"
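
Example (a path or raw bytes both work, per the signature):

from docprompt.utils.util import is_pdf

print(is_pdf("report.pdf"))      # sniffs the file on disk

with open("report.pdf", "rb") as f:
    print(is_pdf(f.read()))      # sniffs raw bytes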

load_pdf_document(fp, *, file_name=None, password=None)

Loads a PDF document from a file path or raw bytes

Source code in docprompt/utils/util.py
def load_pdf_document(
    fp: Union[Path, PathLike, bytes],
    *,
    file_name: Optional[str] = None,
    password: Optional[str] = None,
) -> PdfDocument:
    """
    Loads a document from a file path
    """
    if isinstance(fp, bytes):
        file_bytes = fp
        file_name = file_name or determine_pdf_name_from_bytes(file_bytes)
    else:
        file_name = name_from_path(fp) if file_name is None else file_name

        file_bytes = read_pdf_bytes_from_path(fp)

    if not is_pdf(file_bytes):
        raise ValueError("File is not a PDF")

    return PdfDocument(
        name=unquote(file_name),
        # Avoid stringifying raw bytes into a bogus path
        file_path=str(fp) if not isinstance(fp, bytes) else None,
        file_bytes=file_bytes,
        password=password,
    )
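
Example (a sketch; password applies to encrypted PDFs, and file_name is inferred when omitted):

from docprompt.utils.util import load_pdf_document

doc = load_pdf_document("report.pdf")

with open("report.pdf", "rb") as f:
    doc_from_bytes = load_pdf_document(f.read(), file_name="report.pdf")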

load_pdf_documents(fps, *, max_threads=12, passwords=None)

Loads multiple documents from file paths, using a thread pool

Source code in docprompt/utils/util.py
def load_pdf_documents(
    fps: List[Union[Path, PathLike, bytes]],
    *,
    max_threads: int = 12,
    passwords: Optional[List[str]] = None,
):
    """
    Loads multiple documents from file paths, using a thread pool
    """
    futures = []

    thread_count = min(max_threads, len(fps))

    with ThreadPoolExecutor(max_workers=thread_count) as executor:
        for idx, fp in enumerate(fps):
            password = passwords[idx] if passwords else None
            futures.append(
                executor.submit(load_pdf_document, fp, password=password)
            )

    results = []

    for future in as_completed(futures):
        results.append(future.result())

    return results
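
Example (a sketch; results are gathered with as_completed, so their order may not match the input order):

from docprompt.utils.util import load_pdf_documents

docs = load_pdf_documents(["a.pdf", "b.pdf", "c.pdf"], max_threads=4)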