Skip to content


insert_generator(bboxes, data=None)

Make an iterator that yields tuples of (id, bbox, data) for insertion into an RTree index, which improves performance massively.

Source code in docprompt/provenance/
def insert_generator(bboxes: List[NormBBox], data: Optional[Iterable[Any]] = None):
    """
    Make an iterator that yields tuples of (id, bbox, data) for insertion into an RTree index
    which improves performance massively.

    Args:
        bboxes: Boxes to index; each box's position in the list becomes its id.
        data: Optional payloads paired positionally with ``bboxes``. When omitted
            (or otherwise falsy, e.g. an empty list), every yielded tuple carries
            ``None`` as its payload.

    Yields:
        ``(idx, construct_valid_rtree_tuple(bbox), data_item)`` for each box.
    """
    # A falsy ``data`` falls back to one ``None`` payload per box.
    data = data or [None] * len(bboxes)

    # zip stops at the shorter input, so unmatched boxes or payloads are dropped.
    for idx, (bbox, data_item) in enumerate(zip(bboxes, data)):
        yield (idx, construct_valid_rtree_tuple(bbox), data_item)


Improve matching ability by applying some preprocessing to the query text.

Source code in docprompt/provenance/
def preprocess_query_text(text: str) -> str:
    """
    Improve matching ability by applying some preprocessing to the query text.

    Every pattern in ``_prefix_regexs`` is removed from the text, double quotes
    are dropped, and surrounding whitespace is trimmed.

    Args:
        text: The raw query text.

    Returns:
        The cleaned query text.
    """
    for regex in _prefix_regexs:
        text = regex.sub("", text)

    text = text.replace('"', "")

    # Trim only after the quotes are gone: stripping first leaves behind any
    # whitespace that sat just inside a quoted query (e.g. '" foo "'), which
    # would later show up as empty tokens when the query is split on whitespace.
    return text.strip()

refine_block_to_word_level(source_block, intersecting_word_level_blocks, query)

Create a new text block by merging the intersecting word level blocks that match the query.

Source code in docprompt/provenance/
# Merge the word-level blocks that match `query` into a single block.
# NOTE(review): this listing appears to have lost lines during extraction
# (signature close, docstring quotes, and the bodies/headers of several
# statements). Each suspected gap is flagged inline; restore the missing lines
# from the upstream source rather than guessing at them.
def refine_block_to_word_level(
    source_block: TextBlock,
    intersecting_word_level_blocks: List[TextBlock],
    query: str,
    # NOTE(review): the closing of the parameter list and the docstring quotes
    # appear to be missing around the next two lines.
    Create a new text block by merging the intersecting word level blocks that
    match the query.

        # NOTE(review): this looks like the `key=` argument of a sort call whose
        # opening line, first tuple element and closing parenthesis are missing.
        key=lambda x: (, x.bounding_box.x0)

    tokenized_query = word_tokenize(query)

    # Single-token query: return the first word block whose normalized text has a
    # fuzz.ratio above 87.5 against the normalized token.
    if len(tokenized_query) == 1:
        fuzzified = default_process(tokenized_query[0])
        for word_level_block in intersecting_word_level_blocks:
            if fuzz.ratio(fuzzified, default_process(word_level_block.text)) > 87.5:
                return word_level_block, [word_level_block]
        # NOTE(review): an `else:` presumably belongs here, and the element
        # expression plus closing bracket of this list comprehension are missing.
        fuzzified_word_level_texts = [
            for word_level_block in intersecting_word_level_blocks

        # Populate the block mapping
        token_block_mapping = defaultdict(set)

        first_word = tokenized_query[0]
        last_word = tokenized_query[-1]

        # Compare every query token against every word block (same 87.5 threshold).
        for token in tokenized_query:
            fuzzified_token = default_process(token)
            for i, word_level_block in enumerate(intersecting_word_level_blocks):
                if fuzz.ratio(fuzzified_token, fuzzified_word_level_texts[i]) > 87.5:
                    # NOTE(review): the body of this `if` is missing; presumably it
                    # records index `i` under `token` in token_block_mapping.

        graph = networkx.DiGraph()
        prev = tokenized_query[0]

        for i in token_block_mapping[prev]:
            # NOTE(review): the body of this loop is missing from the listing.

        # Link candidates of consecutive query tokens, only forward in block index.
        for token in tokenized_query[1:]:
            for prev_block in token_block_mapping[prev]:
                for block in sorted(token_block_mapping[token]):
                    if block > prev_block:
                        weight = (
                            (block - prev_block) ** 2
                        )  # Square the distance to penalize large jumps, which encourages reading order
                        graph.add_edge(prev_block, block, weight=weight)

            prev = token

        # Get every combination of first and last word
        first_word_blocks = token_block_mapping[first_word]
        last_word_blocks = token_block_mapping[last_word]

        # Candidate (start, end) index pairs, smallest index span first.
        # NOTE(review): the closing parenthesis of this sorted(...) call is missing.
        combinations = sorted(
            [(x, y) for x in first_word_blocks for y in last_word_blocks if x < y],
            key=lambda x: abs(x[1] - x[0]),

        for start, end in combinations:
                # NOTE(review): the `try:` line and both `except` bodies are
                # missing from the listing.
                path = networkx.shortest_path(graph, start, end, weight="weight")
            except networkx.NetworkXNoPath:
            except Exception:

            # `path` holds indices into intersecting_word_level_blocks.
            matching_blocks = [intersecting_word_level_blocks[i] for i in path]

            # NOTE(review): the closing parenthesis of this call is missing.
            merged_bbox = NormBBox.combine(
                *[word_level_block.bounding_box for word_level_block in matching_blocks]

            merged_text = ""

            for word_level_block in matching_blocks:
                merged_text += word_level_block.text
                if not word_level_block.text.endswith(" "):
                    merged_text += " "  # Ensure there is a space between words

            # NOTE(review): the returned tuple's contents are missing from the
            # listing, as is whatever the function returns when nothing matches.
            return (


Tokenize a string into words.

Source code in docprompt/provenance/
def word_tokenize(text: str) -> List[str]:
    """
    Tokenize a string into words.

    The text is split on runs of whitespace. Leading and trailing whitespace is
    ignored so that it does not produce empty tokens.

    Args:
        text: The string to split.

    Returns:
        The list of words. An empty or whitespace-only string yields ``[""]``
        (a single empty token), never an empty list, so callers may always
        index the first and last token.
    """
    # Strip first: re.split emits "" for a separator match at either end.
    return re.split(r"\s+", text.strip())