Skip to content

_pdfium

chunk_iterable(iterable, chunk_size)

Splits an iterable into chunks of at most the specified size, balancing chunk lengths so the remainder is distributed evenly.

Parameters:

Name Type Description Default
iterable Iterable[T]

The iterable to be chunked.

required
chunk_size int

The desired size of each chunk.

required

Returns:

Type Description
List[List[T]]

List[List[T]]: A list of lists, where each sublist is a chunk.

Source code in docprompt/_pdfium.py
def chunk_iterable(iterable: Iterable[T], chunk_size: int) -> List[List[T]]:
    """
    Splits an iterable into balanced chunks of at most `chunk_size` items.

    The number of chunks is the smallest count that keeps every chunk within
    `chunk_size`; the items are then spread across those chunks so their
    lengths differ by at most one (earlier chunks receive the extra items).

    Args:
        iterable (Iterable[T]): The iterable to be chunked. It is fully
            consumed and materialized into a list.
        chunk_size (int): The maximum size of each chunk. Must be positive.

    Returns:
        List[List[T]]: A list of lists, where each sublist is a chunk. An
            empty iterable yields an empty list.

    Raises:
        ValueError: If `chunk_size` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    items = list(iterable)
    total_items = len(items)

    # Without this guard the chunk count would be zero and the division
    # below would raise ZeroDivisionError.
    if total_items == 0:
        return []

    # Ceiling division: the fewest chunks that respect chunk_size
    num_chunks = (total_items + chunk_size - 1) // chunk_size

    # Every chunk gets base_size items; the first `remainder` chunks get one more
    base_size, remainder = divmod(total_items, num_chunks)

    chunks: List[List[T]] = []
    start = 0
    for index in range(num_chunks):
        end = start + base_size + (1 if index < remainder else 0)
        chunks.append(items[start:end])
        start = end

    return chunks

get_pdfium_document(fp, password=None)

Loads a PDF document with a lock to prevent race conditions in threaded environments

Source code in docprompt/_pdfium.py
@contextmanager
def get_pdfium_document(
    fp: Union[PathLike, Path, bytes, str], password: Optional[str] = None
):
    """
    Context manager that opens a PDF with PDFium and closes it on exit.

    Only the construction of the document happens while `PDFIUM_LOAD_LOCK`
    is held; the lock is released before the document is yielded to the
    caller. The document is created with `autoclose=False` and is closed
    explicitly when the context exits, whether normally or via an exception.

    Args:
        fp: Source of the PDF, passed straight to `pdfium.PdfDocument`.
        password: Optional password forwarded to `pdfium.PdfDocument`.

    Yields:
        The opened `pdfium.PdfDocument`.
    """
    # Serialize document loading across threads
    with PDFIUM_LOAD_LOCK:
        document = pdfium.PdfDocument(fp, password=password, autoclose=False)

    try:
        yield document
    finally:
        # autoclose is disabled above, so the document must be released here
        document.close()

rasterize_page_with_pdfium(fp, page_number, *, return_mode='pil', post_process_fn=None, **kwargs)

Rasterizes a page of a PDF document

Source code in docprompt/_pdfium.py
def rasterize_page_with_pdfium(
    fp: Union[PathLike, Path, bytes],
    page_number: int,
    *,
    return_mode: Literal["pil", "bytes"] = "pil",
    post_process_fn: Optional[Callable[[Image.Image], Image.Image]] = None,
    **kwargs,
) -> Union[Image.Image, bytes]:
    """
    Rasterizes a single page of a PDF document.

    Args:
        fp: Source of the PDF, opened via `get_pdfium_document` (no password
            is supplied).
        page_number: Page to render. The value handed to `_render_job` is
            `page_number - 1`, so callers presumably pass a 1-based number.
        return_mode: Forwarded to `_render_job`; either "pil" or "bytes".
        post_process_fn: Optional callable forwarded to `_render_job`.
        **kwargs: Collected into a dict and passed to `_render_job` as its
            third positional argument.

    Returns:
        Whatever `_render_job` produces for the requested page.
    """
    zero_based_index = page_number - 1

    with get_pdfium_document(fp) as document:
        rendered = _render_job(
            zero_based_index,
            document,
            kwargs,
            return_mode=return_mode,
            post_process_fn=post_process_fn,
        )

    return rendered

rasterize_pdf_with_pdfium(fp, password=None, *, return_mode='pil', post_process_fn=None, **kwargs)

Rasterizes an entire PDF using PDFium and a pool of workers

Source code in docprompt/_pdfium.py
def rasterize_pdf_with_pdfium(
    fp: Union[PathLike, Path, bytes],
    password: Optional[str] = None,
    *,
    return_mode: Literal["pil", "bytes"] = "pil",
    post_process_fn: Optional[Callable[[Image.Image], Image.Image]] = None,
    **kwargs,
) -> List[Union[Image.Image, bytes]]:
    """
    Rasterizes an entire PDF using PDFium and a pool of workers

    The document is first opened to count its pages, then every page index is
    dispatched to a "spawn"-context process pool whose workers are set up by
    `_render_parallel_init`.

    Args:
        fp: Source of the PDF.
        password: Optional password used to open the document.
        return_mode: Forwarded to the workers; either "pil" or "bytes".
        post_process_fn: Optional callable forwarded to the workers.
        **kwargs: Collected into a dict and forwarded to the workers.

    Returns:
        One rendered result per page, in page order. A document with no
        pages yields an empty list.
    """
    with get_pdfium_document(fp, password=password) as pdf:
        total_pages = len(pdf)

    # ProcessPoolExecutor raises ValueError for max_workers=0, so there is
    # nothing to schedule (and nothing to return) for a page-less document.
    if total_pages == 0:
        return []

    # Never start more workers than there are pages to render
    max_workers = min(mp.cpu_count(), total_pages)

    ctx = mp.get_context("spawn")

    with potential_temporary_file(fp) as temp_fp:
        # Positional arguments for _render_parallel_init; their meaning is
        # defined by that function's signature (not visible here).
        initargs = (
            None,
            temp_fp,
            password,
            False,
            kwargs,
            return_mode,
            post_process_fn,
        )

        with ft.ProcessPoolExecutor(
            max_workers=max_workers,
            initializer=_render_parallel_init,
            initargs=initargs,
            mp_context=ctx,
        ) as executor:
            # map() preserves input order, so results line up with page indices
            rendered_pages = list(
                executor.map(_render_parallel_job, range(total_pages), chunksize=1)
            )

        return rendered_pages

rasterize_pdfs_with_pdfium(fps, passwords=None, *, return_mode='pil', post_process_fn=None, **kwargs)

Like 'rasterize_pdf_with_pdfium', but optimized for multiple PDFs by loading all PDFs into the workers' memory space

Source code in docprompt/_pdfium.py
def rasterize_pdfs_with_pdfium(
    fps: List[Union[PathLike, Path, bytes]],
    passwords: Optional[List[str]] = None,
    *,
    return_mode: Literal["pil", "bytes"] = "pil",
    post_process_fn: Optional[Callable[[Image.Image], Image.Image]] = None,
    **kwargs,
) -> Dict[int, Dict[int, Union[Image.Image, bytes]]]:
    """
    Like 'rasterize_pdf_with_pdfium', but optimized for multiple PDFs by loading all PDFs into the workers' memory space

    The PDFs are written to a temporary directory, their pages are split
    across "spawn"-context worker processes by `distribute_pdfs`, and each
    worker reports results through a manager queue that is drained here.

    Args:
        fps: The PDFs to rasterize.
        passwords: Optional list with one entry per PDF (use None for PDFs
            without a password).
        return_mode: Forwarded to the workers; either "pil" or "bytes".
        post_process_fn: Optional callable forwarded to the workers.
        **kwargs: Accepted for signature parity; not forwarded to the workers
            by this function.

    Returns:
        A mapping of PDF index (position in `fps`) to a mapping of page
        identifier (as reported by the worker) to the rendered result.
        Returns an empty mapping when there are no pages to process.

    Raises:
        ValueError: If `passwords` is given but its length differs from `fps`.
    """
    if passwords and len(passwords) != len(fps):
        raise ValueError(
            "If specifying passwords, must provide one for each PDF. Use None for no password."
        )
    passwords = passwords or [None] * len(fps)

    ctx = mp.get_context("spawn")

    results: Dict[int, Dict[int, Union[Image.Image, bytes]]] = {}

    with tempfile.TemporaryDirectory(prefix="docprompt_raster_tmp") as tempdir:
        writable_fps = _get_writable_temp_fp_paths(fps, tempdir)
        page_counts = _get_page_counts_from_pdfs(writable_fps)
        total_to_process = sum(page_counts)

        # With no pages the worker count would be zero; skip spawning entirely
        if total_to_process == 0:
            return results

        max_workers = min(mp.cpu_count(), total_to_process)

        pdf_page_map = dict(enumerate(page_counts))
        # Workers report results by file path; map those back to input positions
        name_to_idx = {fp: i for i, fp in enumerate(writable_fps)}

        core_pdf_assignments = distribute_pdfs(pdf_page_map, max_workers)

        with mp.Manager() as manager:
            mp_queue = manager.Queue()

            processes = []

            with tqdm(total=total_to_process, desc="Rasterizing PDF's") as pbar:
                for assigned_pages in core_pdf_assignments.values():
                    data = {
                        (writable_fps[i], passwords[i]): pages
                        for i, pages in assigned_pages.items()
                    }

                    p = ctx.Process(
                        target=process_work,
                        args=(data, post_process_fn, return_mode, mp_queue),
                    )
                    p.start()
                    processes.append(p)

                # Keep draining until every worker has exited AND the queue is
                # empty, so results queued just before a worker exits are kept.
                while any(p.is_alive() for p in processes) or not mp_queue.empty():
                    try:
                        pdf, page, result = mp_queue.get(timeout=0.5)
                    except queue.Empty:
                        continue
                    i = name_to_idx[pdf]
                    results.setdefault(i, {})[page] = result
                    pbar.update(1)

                # All workers have exited by now; join to release their resources
                for p in processes:
                    p.join()

    return results