Skip to content

Index

base

AbstractDocumentTaskProvider

Bases: AbstractTaskProvider

A task provider performs a specific, repeatable task on a document.

Source code in docprompt/tasks/base.py
class AbstractDocumentTaskProvider(AbstractTaskProvider):
    """
    Abstract base for providers that perform a specific, repeatable task on a
    document.

    Subclasses declare their `capabilities` as a list of
    `DocumentLevelCapabilities` members.
    """

    # Annotation only: no default value is assigned on this class.
    capabilities: ClassVar[List[DocumentLevelCapabilities]]

    # NOTE: For now, the stubs need to be defined here for the flexible
    # decorators to work.

    class Meta:
        # Flags this class as abstract rather than a concrete implementation.
        abstract = True

AbstractPageTaskProvider

Bases: AbstractTaskProvider

A page task provider performs a specific, repeatable task on a page.

Source code in docprompt/tasks/base.py
class AbstractPageTaskProvider(AbstractTaskProvider):
    """
    Abstract base for providers that perform a specific, repeatable task on a
    page.

    Subclasses declare their `capabilities` as a list of
    `PageLevelCapabilities` members.
    """

    # Annotation only: no default value is assigned on this class.
    capabilities: ClassVar[List[PageLevelCapabilities]]

    # NOTE: For now, the stubs need to be defined here for the flexible
    # decorators to work.

    class Meta:
        # Flags this class as abstract rather than a concrete implementation.
        abstract = True

AbstractTaskProvider

Bases: BaseModel, Generic[TTaskInput, TTaskConfig, TTaskResult]

A task provider performs a specific, repeatable task on a document or its pages.

NOTE: Either the process_document_node or aprocess_document_node method must be implemented in a valid subclass. The process_document_node method is the synchronous entry point, while the aprocess_document_node method is an async version of the same method.

If you wish to provide separate implementations for sync and async, you can define both methods individually, and they will each use their own custom implementation when called. Otherwise, if you only implement one or the other of a flexible method pair, the other will automatically be generated and provided for you at runtime.

Source code in docprompt/tasks/base.py
@flexible_methods(
    ("process_document_node", "aprocess_document_node"),
    ("_invoke", "_ainvoke"),
)
class AbstractTaskProvider(BaseModel, Generic[TTaskInput, TTaskConfig, TTaskResult]):
    """
    A task provider performs a specific, repeatable task on a document or its pages.

    NOTE: Either the `process_document_node` or `aprocess_document_node` method must be implemented in
    a valid subclass. The `process_document_node` method is the synchronous entry point, while the
    `aprocess_document_node` method is an async version of the same method. The `_invoke` / `_ainvoke`
    pair is registered with the same `flexible_methods` decorator.

    If you wish to provide separate implementations for sync and async, you can define both methods individually, and
    they will each use their own custom implementation when called. Otherwise, if you only implement one or the other of
    a flexible method pair, the other will automatically be generated and provided for you at runtime.
    """

    name: ClassVar[str]
    capabilities: ClassVar[List[Capabilites]]

    # TODO: Potentially utilize context here during instantiation from Factory??
    # Keyword arguments merged into every `invoke` / `ainvoke` call.
    _default_invoke_kwargs: Dict[str, str] = PrivateAttr()

    class Meta:
        """The meta class is utilized by the flexible methods decorator.

        For all classes that are not concrete implementations, we should set the
        abstract attribute to True, which will prevent the check from failing when
        the flexible methods decorator is looking for the implementation of the
        methods.
        """

        abstract = True

    def __init__(self, invoke_kwargs: Optional[Dict[str, str]] = None, **data):
        """Validate `data` into this instance, exposing `invoke_kwargs` as validation context.

        Args:
            invoke_kwargs: Default keyword arguments for later `invoke` / `ainvoke`
                calls. `None` is treated as an empty dict.
            **data: Field values validated by the pydantic validator.
        """
        with init_context({"invoke_kwargs": invoke_kwargs or {}}):
            self.__pydantic_validator__.validate_python(
                data,
                self_instance=self,
                context=_init_context_var.get(),
            )

    @model_validator(mode="before")
    @classmethod
    def validate_class_vars(cls, data: Any) -> Any:
        """
        Ensure that the class has a name and capabilities defined.

        Raises:
            ValueError: If `name` or `capabilities` is missing on the class, or
                `capabilities` is empty.
        """

        if not hasattr(cls, "name"):
            raise ValueError("Task providers must have a name defined")

        if not hasattr(cls, "capabilities"):
            raise ValueError("Task providers must have capabilities defined")

        if not cls.capabilities:
            raise ValueError("Task providers must have at least one capability defined")

        return data

    @model_validator(mode="after")
    def set_invoke_kwargs(self, info: ValidationInfo) -> Self:
        """
        Set the default invoke kwargs for the task provider.

        The kwargs are read from the validation context. When validation runs
        without a context (or without an `invoke_kwargs` entry), the defaults
        fall back to an empty dict instead of raising.
        """
        # `info.context` is None when no context was supplied to validation.
        context = info.context or {}
        self._default_invoke_kwargs = context.get("invoke_kwargs") or {}
        return self

    async def _ainvoke(
        self,
        input: Iterable[TTaskInput],
        config: Optional[TTaskConfig] = None,
        **kwargs,
    ) -> List[TTaskResult]:
        """Async implementation hook; this base version always raises NotImplementedError."""
        raise NotImplementedError

    async def ainvoke(
        self,
        input: Iterable[TTaskInput],
        config: Optional[TTaskConfig] = None,
        **kwargs,
    ) -> List[TTaskResult]:
        """Call `_ainvoke` with the default invoke kwargs merged under `kwargs`."""
        # Call-site kwargs win over the instance defaults on key collisions.
        invoke_kwargs = {
            **self._default_invoke_kwargs,
            **kwargs,
        }

        return await self._ainvoke(input, config, **invoke_kwargs)

    def _invoke(
        self,
        input: Iterable[TTaskInput],
        config: Optional[TTaskConfig] = None,
        **kwargs,
    ) -> List[TTaskResult]:
        """Sync implementation hook; this base version always raises NotImplementedError."""
        raise NotImplementedError

    def invoke(
        self,
        input: Iterable[TTaskInput],
        config: Optional[TTaskConfig] = None,
        **kwargs,
    ) -> List[TTaskResult]:
        """Call `_invoke` with the default invoke kwargs merged under `kwargs`."""
        # Call-site kwargs win over the instance defaults on key collisions.
        invoke_kwargs = {
            **self._default_invoke_kwargs,
            **kwargs,
        }

        return self._invoke(input, config, **invoke_kwargs)

    def process_document_node(
        self,
        document_node: "DocumentNode",
        task_config: Optional[TTaskConfig] = None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        contribute_to_document: bool = True,
        **kwargs,
    ) -> Dict[int, TTaskResult]:
        """Sync document-processing hook; this base version always raises NotImplementedError."""
        raise NotImplementedError

    async def aprocess_document_node(
        self,
        document_node: "DocumentNode",
        task_config: Optional[TTaskConfig] = None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        contribute_to_document: bool = True,
        **kwargs,
    ) -> Dict[int, TTaskResult]:
        """Async document-processing hook; this base version always raises NotImplementedError."""
        raise NotImplementedError

Meta

The meta class is utilized by the flexible methods decorator.

For all classes that are not concrete implementations, we should set the abstract attribute to True, which will prevent the check from failing when the flexible methods decorator is looking for the implementation of the methods.

Source code in docprompt/tasks/base.py
class Meta:
    """Configuration read by the flexible methods decorator.

    Every class that is not a concrete implementation should set `abstract`
    to True. Doing so prevents the check from failing when the flexible
    methods decorator looks for the implementation of the methods.
    """

    abstract = True

set_invoke_kwargs(info)

Set the default invoke kwargs for the task provider.

Source code in docprompt/tasks/base.py
@model_validator(mode="after")
def set_invoke_kwargs(self, info: ValidationInfo) -> Self:
    """
    Set the default invoke kwargs for the task provider.

    The kwargs are read from the validation context. When validation runs
    without a context (or without an `invoke_kwargs` entry), the defaults
    fall back to an empty dict instead of raising.
    """
    # `info.context` is None when no context was supplied to validation.
    context = info.context or {}
    self._default_invoke_kwargs = context.get("invoke_kwargs") or {}
    return self

validate_class_vars(data) classmethod

Ensure that the class has a name and capabilities defined.

Source code in docprompt/tasks/base.py
@model_validator(mode="before")
@classmethod
def validate_class_vars(cls, data: Any) -> Any:
    """
    Check that the provider class declares `name` and non-empty `capabilities`.

    The input `data` is returned untouched; only class attributes are inspected.
    A ValueError is raised for the first requirement that is not met.
    """
    # Presence checks run in this order: name first, then capabilities.
    required_class_vars = (
        ("name", "Task providers must have a name defined"),
        ("capabilities", "Task providers must have capabilities defined"),
    )
    for attribute, message in required_class_vars:
        if not hasattr(cls, attribute):
            raise ValueError(message)

    # An attribute that exists but is empty is rejected as well.
    if not cls.capabilities:
        raise ValueError("Task providers must have at least one capability defined")

    return data

capabilities

PageLevelCapabilities

Bases: str, Enum

Represents a capability that a provider can fulfill

Source code in docprompt/tasks/capabilities.py
class PageLevelCapabilities(str, Enum):
    """
    Capabilities that a provider can fulfill at the page level.

    Each member is also a `str`; its value is the hyphenated identifier.
    """

    # Rendering and text recognition
    PAGE_RASTERIZATION = "page-rasterization"
    PAGE_LAYOUT_OCR = "page-layout-ocr"
    PAGE_TEXT_OCR = "page-text-ocr"

    # Page understanding
    PAGE_CLASSIFICATION = "page-classification"
    PAGE_MARKERIZATION = "page-markerization"
    PAGE_SEGMENTATION = "page-segmentation"
    PAGE_VQA = "page-vqa"

    # Tables
    PAGE_TABLE_IDENTIFICATION = "page-table-identification"
    PAGE_TABLE_EXTRACTION = "page-table-extraction"

classification

anthropic

The Anthropic implementation of page-level classification.

AnthropicClassificationProvider

Bases: BaseClassificationProvider

The Anthropic implementation of unscored page classification.

Source code in docprompt/tasks/classification/anthropic.py
class AnthropicClassificationProvider(BaseClassificationProvider):
    """Page classification provider backed by Anthropic (unscored classification)."""

    name = "anthropic"

    async def _ainvoke(
        self, input: Iterable[bytes], config: ClassificationConfig = None, **kwargs
    ) -> List[ClassificationOutput]:
        """Classify each input and return one parsed output per completion.

        Messages are prepared from `input` and `config`, sent through batch
        inference, and each completion is parsed with a parser built from
        `config`.
        """
        prompts = _prepare_messages(input, config)

        output_parser = AnthropicPageClassificationOutputParser.from_task_input(
            config, provider_name=self.name
        )

        # NOTE(review): `kwargs` is accepted but not passed on to the inference
        # call - confirm whether that is intentional.
        raw_completions = await inference.run_batch_inference_anthropic(prompts)

        return list(map(output_parser.parse, raw_completions))

AnthropicPageClassificationOutputParser

Bases: BasePageClassificationOutputParser

The output parser for the page classification system.

Source code in docprompt/tasks/classification/anthropic.py
class AnthropicPageClassificationOutputParser(BasePageClassificationOutputParser):
    """Output parser that turns an Anthropic completion into a classification output."""

    def parse(self, text: str) -> ClassificationOutput:
        """Build a `ClassificationOutput` from the completion text.

        The label(s) come from the text after "Answer: ". When `confidence` is
        enabled on the parser, the text after "Confidence: " is resolved into
        the output's `score`.
        """
        answer_match = re.search(r"Answer: (.+)", text)
        output_fields = {
            "type": self.type,
            "labels": self.resolve_match(answer_match),
            "provider_name": self.name,
        }

        # A score is only attached when confidence reporting was requested.
        if self.confidence:
            confidence_match = re.search(r"Confidence: (.+)", text)
            output_fields["score"] = self.resolve_confidence(confidence_match)

        return ClassificationOutput(**output_fields)
parse(text)

Parse the results of the classification task.

Source code in docprompt/tasks/classification/anthropic.py
def parse(self, text: str) -> ClassificationOutput:
    """Build a `ClassificationOutput` from the completion text.

    The label(s) come from the text after "Answer: ". When `confidence` is
    enabled on the parser, the text after "Confidence: " is resolved into
    the output's `score`.
    """
    answer_match = re.search(r"Answer: (.+)", text)
    output_fields = {
        "type": self.type,
        "labels": self.resolve_match(answer_match),
        "provider_name": self.name,
    }

    # A score is only attached when confidence reporting was requested.
    if self.confidence:
        confidence_match = re.search(r"Confidence: (.+)", text)
        output_fields["score"] = self.resolve_confidence(confidence_match)

    return ClassificationOutput(**output_fields)

base

BaseClassificationProvider

Bases: AbstractPageTaskProvider[bytes, ClassificationConfig, ClassificationOutput]

The base classification provider.

Source code in docprompt/tasks/classification/base.py
class BaseClassificationProvider(
    AbstractPageTaskProvider[bytes, ClassificationConfig, ClassificationOutput]
):
    """
    Shared base for page classification providers.

    Rasterizes the requested pages of a document node and hands the images to
    `_invoke`.
    """

    capabilities = [PageLevelCapabilities.PAGE_CLASSIFICATION]

    class Meta:
        abstract = True

    def process_document_node(
        self,
        document_node: "DocumentNode",
        task_config: ClassificationConfig = None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        contribute_to_document: bool = True,
        **kwargs,
    ):
        """Classify a range of pages and return the results keyed by page number.

        Page numbers are 1-based. A falsy `start` means page 1 and a falsy
        `stop` means the last page; `stop` is inclusive.

        NOTE(review): `contribute_to_document` is not read in this method -
        confirm whether results are meant to be stored on the document.
        """
        assert (
            task_config is not None
        ), "task_config must be provided for classification tasks"

        first_page = start or 1
        last_page = stop or len(document_node)
        page_numbers = range(first_page, last_page + 1)

        # `page_nodes` is indexed from zero, hence the `- 1`.
        raster_bytes = [
            document_node.page_nodes[number - 1].rasterizer.rasterize("default")
            for number in page_numbers
        ]

        # TODO: This is a somewhat dangerous way of requiring these kwargs to be drilled
        # through, potentially a decorator solution to be had here
        merged_kwargs = {**self._default_invoke_kwargs, **kwargs}
        results = self._invoke(raster_bytes, config=task_config, **merged_kwargs)

        return dict(zip(page_numbers, results))

BasePageClassificationOutputParser

Bases: ABC, BaseOutputParser[ClassificationConfig, ClassificationOutput]

The output parser for the page classification system.

Source code in docprompt/tasks/classification/base.py
class BasePageClassificationOutputParser(
    ABC, BaseOutputParser[ClassificationConfig, ClassificationOutput]
):
    """The output parser for the page classification system."""

    name: str = Field(...)
    type: ClassificationTypes = Field(...)
    labels: LabelType = Field(...)
    confidence: bool = Field(False)

    @classmethod
    def from_task_input(cls, task_input: ClassificationConfig, provider_name: str):
        """Build a parser from a classification config and the provider's name."""
        return cls(
            type=task_input.type,
            name=provider_name,
            labels=task_input.labels,
            confidence=task_input.confidence,
        )

    def resolve_match(self, _match: Union[re.Match, None]) -> LabelType:
        """Resolve the answer match into validated label(s).

        Args:
            _match: The regex match whose first group holds the answer, or
                None when no answer was found.

        Returns:
            A single label for binary and single-label classification, or a
            list of labels for multi-label classification.

        Raises:
            ValueError: If there is no match, a label is not one of the
                configured labels, or the classification type is unknown.
        """

        if _match is None:
            raise ValueError("Could not find the answer in the text.")

        # `(.+)` also captures trailing whitespace (e.g. "\r" or spaces), which
        # would otherwise make a valid label fail the membership check.
        val = _match.group(1).strip()

        if self.type in (ClassificationTypes.BINARY, ClassificationTypes.SINGLE_LABEL):
            if val not in self.labels:
                raise ValueError(f"Invalid label: {val}")
            return val

        if self.type == ClassificationTypes.MULTI_LABEL:
            labels = [label.strip() for label in val.split(", ")]
            for label in labels:
                if label not in self.labels:
                    raise ValueError(f"Invalid label: {label}")
            return labels

        raise ValueError(f"Invalid classification type: {self.type}")

    def resolve_confidence(
        self, _match: Union[re.Match, None]
    ) -> Optional[ConfidenceLevel]:
        """Get the confidence level from the text.

        Returns None when no confidence match was found. The captured value is
        stripped and lower-cased before it is converted; a value that is not a
        `ConfidenceLevel` raises ValueError.
        """

        if _match is None:
            return None

        val = _match.group(1).strip().lower()

        return ConfidenceLevel(val)

    @abstractmethod
    def parse(self, text: str) -> ClassificationOutput: ...
resolve_confidence(_match)

Get the confidence level from the text.

Source code in docprompt/tasks/classification/base.py
def resolve_confidence(
    self, _match: Union[re.Match, None]
) -> Optional[ConfidenceLevel]:
    """Get the confidence level from the text.

    Returns None when no confidence match was found. The captured value is
    stripped and lower-cased before it is converted; a value that is not a
    `ConfidenceLevel` raises ValueError.
    """

    if _match is None:
        return None

    # `(.+)` also captures trailing whitespace, which would make an otherwise
    # valid level fail the enum lookup.
    val = _match.group(1).strip().lower()

    return ConfidenceLevel(val)
resolve_match(_match)

Resolve the regex match for the answer into the validated label(s) for the configured classification type.

Source code in docprompt/tasks/classification/base.py
def resolve_match(self, _match: Union[re.Match, None]) -> LabelType:
    """Resolve the answer match into validated label(s).

    Args:
        _match: The regex match whose first group holds the answer, or
            None when no answer was found.

    Returns:
        A single label for binary and single-label classification, or a
        list of labels for multi-label classification.

    Raises:
        ValueError: If there is no match, a label is not one of the
            configured labels, or the classification type is unknown.
    """

    if _match is None:
        raise ValueError("Could not find the answer in the text.")

    # `(.+)` also captures trailing whitespace (e.g. "\r" or spaces), which
    # would otherwise make a valid label fail the membership check.
    val = _match.group(1).strip()

    if self.type in (ClassificationTypes.BINARY, ClassificationTypes.SINGLE_LABEL):
        if val not in self.labels:
            raise ValueError(f"Invalid label: {val}")
        return val

    if self.type == ClassificationTypes.MULTI_LABEL:
        labels = [label.strip() for label in val.split(", ")]
        for label in labels:
            if label not in self.labels:
                raise ValueError(f"Invalid label: {label}")
        return labels

    raise ValueError(f"Invalid classification type: {self.type}")

ClassificationConfig

Bases: BaseModel

Source code in docprompt/tasks/classification/base.py
class ClassificationConfig(BaseModel):
    """Configuration for a classification task.

    `labels` is required for single-label and multi-label classification.
    Binary classification requires `instructions` and always uses the labels
    "YES" and "NO".
    """

    type: ClassificationTypes
    labels: LabelType
    descriptions: Optional[List[str]] = Field(
        None, description="The descriptions for each label (if any)."
    )

    instructions: Optional[str] = Field(
        None,
        description="Additional instructions to pass to the LLM for the task. Required for Binary Classification.",
    )

    confidence: bool = Field(False)

    @model_validator(mode="before")
    @classmethod
    def validate_label_bindings(cls, data: Any) -> Any:
        """Validate the label/description bindings based on the type.

        Returns:
            The data to validate. For binary classification this is a copy of
            the input with `labels` set to ["YES", "NO"]; otherwise the input
            is returned unchanged.

        Raises:
            ValueError: If `labels` is missing for single/multi-label
                classification, or `instructions` is missing for binary
                classification.
        """

        # Non-dict input (e.g. an existing model instance) has no `.get`; hand
        # it straight to the regular validation.
        if not isinstance(data, dict):
            return data

        classification_type = data.get("type", None)

        if classification_type == ClassificationTypes.SINGLE_LABEL:
            if not data.get("labels", None):
                raise ValueError(
                    "labels must be provided for single_label classification"
                )

        elif classification_type == ClassificationTypes.BINARY:
            if not data.get("instructions", None):
                raise ValueError(
                    "instructions must be provided for binary classification"
                )
            # Copy rather than assign in place so the caller's dict is not mutated.
            return {**data, "labels": ["YES", "NO"]}

        elif classification_type == ClassificationTypes.MULTI_LABEL:
            if not data.get("labels", None):
                raise ValueError(
                    "labels must be provided for multi_label classification"
                )

        # Always return the data: for a missing or unrecognised `type` the field
        # validation then reports the real problem instead of receiving None.
        return data

    @model_validator(mode="after")
    def validate_descriptions_length(self):
        """Ensure `descriptions`, when given, has one entry per label."""
        if self.descriptions is not None:
            labels = self.labels
            if labels is not None and len(self.descriptions) != len(labels):
                raise ValueError("descriptions must have the same length as labels")
        return self

    @property
    def formatted_labels(self):
        """Produce the formatted labels for the prompt template.

        Yields "label: description" strings when descriptions are set, and the
        bare labels otherwise.
        """
        raw_labels = self.labels
        if self.descriptions:
            for label, description in zip(raw_labels, self.descriptions):
                yield f"{label}: {description}"
        else:
            yield from raw_labels
formatted_labels property

Produce the formatted labels for the prompt template.

validate_label_bindings(data)

Validate the label/description bindings based on the type.

Source code in docprompt/tasks/classification/base.py
@model_validator(mode="before")
@classmethod
def validate_label_bindings(cls, data: Any) -> Any:
    """Validate the label/description bindings based on the type.

    Returns:
        The data to validate. For binary classification this is a copy of
        the input with `labels` set to ["YES", "NO"]; otherwise the input
        is returned unchanged.

    Raises:
        ValueError: If `labels` is missing for single/multi-label
            classification, or `instructions` is missing for binary
            classification.
    """

    # Non-dict input (e.g. an existing model instance) has no `.get`; hand
    # it straight to the regular validation.
    if not isinstance(data, dict):
        return data

    classification_type = data.get("type", None)

    if classification_type == ClassificationTypes.SINGLE_LABEL:
        if not data.get("labels", None):
            raise ValueError(
                "labels must be provided for single_label classification"
            )

    elif classification_type == ClassificationTypes.BINARY:
        if not data.get("instructions", None):
            raise ValueError(
                "instructions must be provided for binary classification"
            )
        # Copy rather than assign in place so the caller's dict is not mutated.
        return {**data, "labels": ["YES", "NO"]}

    elif classification_type == ClassificationTypes.MULTI_LABEL:
        if not data.get("labels", None):
            raise ValueError(
                "labels must be provided for multi_label classification"
            )

    # Always return the data: for a missing or unrecognised `type` the field
    # validation then reports the real problem instead of receiving None.
    return data

ConfidenceLevel

Bases: str, Enum

The confidence level of the classification.

Source code in docprompt/tasks/classification/base.py
class ConfidenceLevel(str, Enum):
    """How confident a classification is: low, medium or high.

    Each member is also a `str`; its value is the lower-case level name.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"

credentials

The credentials module defines a simple model schema for storing credentials.

APIKeyCredential

Bases: BaseCredentials

The API key credential model.

Source code in docprompt/tasks/credentials.py
class APIKeyCredential(BaseCredentials):
    """Credential model that holds a single secret API key."""

    api_key: SecretStr = Field(...)

    def __init__(self, environ_path: Optional[str] = None, **data):
        """Resolve the API key from `data`, falling back to an environment variable.

        Args:
            environ_path: Name of the environment variable to read when
                `data` carries no `api_key`.
            **data: Only the `api_key` entry is used; other entries are not
                passed on to the base initializer.
        """
        resolved_key = data.get("api_key", None)

        # The environment is only consulted when no key was passed explicitly.
        use_environment = resolved_key is None and environ_path
        if use_environment:
            resolved_key = os.environ.get(environ_path, None)

        super().__init__(api_key=resolved_key)

AWSCredentials

Bases: BaseCredentials

The AWS credentials model.

Source code in docprompt/tasks/credentials.py
class AWSCredentials(BaseCredentials):
    """Credential model for AWS: an access key pair with a region, or a session token."""

    aws_access_key_id: Optional[SecretStr] = Field(None)
    aws_secret_access_key: Optional[SecretStr] = Field(None)
    aws_session_token: Optional[SecretStr] = Field(None)
    aws_region: Optional[str] = Field(None)

    def __init__(self, **data):
        """Resolve each field from `data`, defaulting to its environment variable.

        An environment variable is only used when the matching key is absent
        from `data`. Entries in `data` other than the four fields are not
        passed on to the base initializer.
        """
        environment_defaults = (
            ("aws_access_key_id", "AWS_ACCESS_KEY_ID"),
            ("aws_secret_access_key", "AWS_SECRET_ACCESS_KEY"),
            ("aws_session_token", "AWS_SESSION_TOKEN"),
            ("aws_region", "AWS_DEFAULT_REGION"),
        )
        resolved = {
            field_name: data.get(field_name, os.environ.get(env_name, None))
            for field_name, env_name in environment_defaults
        }
        super().__init__(**resolved)

    @model_validator(mode="after")
    def _validate_aws_credentials(self) -> Self:
        """Check that a consistent set of AWS credentials was provided.

        Raises:
            ValueError: If neither a key pair nor a session token is set, if a
                key pair is set without a region, or if both a key pair and a
                session token are set.
        """
        # NOTE(review): a key pair together with a session token is rejected
        # here - confirm that this exclusivity is intended.
        has_key_pair = bool(self.aws_access_key_id and self.aws_secret_access_key)
        has_session_token = bool(self.aws_session_token)

        if has_key_pair:
            if not self.aws_region:
                raise ValueError(
                    "You must provide an AWS region when using an access key and secret key."
                )
            if has_session_token:
                raise ValueError(
                    "You cannot provide both an AWS session token and an access key and secret key."
                )
        elif not has_session_token:
            raise ValueError(
                "You must provide either an AWS session token or an access key and secret key."
            )

        return self

BaseCredentials

Bases: BaseModel

The base credentials model.

Source code in docprompt/tasks/credentials.py
class BaseCredentials(BaseModel):
    """Common base for the credential models."""

    @property
    def kwargs(self) -> Dict[str, str]:
        """Dump the credentials to a dict, replacing secrets with their raw values.

        Fields that are None are left out of the result.
        """
        dumped = self.model_dump(exclude_none=True)
        return {
            key: value.get_secret_value() if isinstance(value, SecretStr) else value
            for key, value in dumped.items()
        }

kwargs: Dict[str, str] property

Return the credentials as a dictionary with secrets exposed.

GCPServiceFileCredentials

Bases: BaseCredentials

The GCP service account credentials model.

Source code in docprompt/tasks/credentials.py
class GCPServiceFileCredentials(BaseCredentials):
    """Credential model for a GCP service account, given as inline info or a file path."""

    service_account_info: Optional[Dict[str, str]] = Field(None)
    service_account_file: Optional[str] = Field(None)

    def __init__(self, **data):
        """Resolve both fields from `data`.

        `service_account_file` defaults to the `GCP_SERVICE_ACCOUNT_FILE`
        environment variable when the key is absent from `data`. Other entries
        in `data` are not passed on to the base initializer.
        """
        file_from_environment = os.environ.get("GCP_SERVICE_ACCOUNT_FILE", None)
        resolved = {
            "service_account_info": data.get("service_account_info", None),
            "service_account_file": data.get(
                "service_account_file", file_from_environment
            ),
        }
        super().__init__(**resolved)

    @model_validator(mode="after")
    def _validate_gcp_credentials(self) -> Self:
        """Require exactly one of `service_account_info` and `service_account_file`."""
        has_info = self.service_account_info is not None
        has_file = self.service_account_file is not None

        if not has_info and not has_file:
            raise ValueError(
                "You must provide either service_account_info or service_account_file. You may set the `GCP_SERVICE_ACCOUNT_FILE` environment variable to the path of the service account file."
            )
        if has_info and has_file:
            raise ValueError(
                "You must provide either service_account_info or service_account_file, not both"
            )
        return self

factory

Define the base factory for creating task providers.

AbstractTaskMixin

Bases: ABC

Base class for all task mixins.

Source code in docprompt/tasks/factory.py
class AbstractTaskMixin(ABC):
    """Common base class shared by every task mixin."""

    # Page- and/or document-level capabilities that a mixin is tagged with.
    tags: ClassVar[List[Union[PageLevelCapabilities, DocumentLevelCapabilities]]]

AbstractTaskProviderFactory

Bases: ABC, BaseModel

The abstract interface for a provider task factory.

We need to define the basic interface for how we can create task providers. The task provider factory is responsible for allowing the creation of task providers for specific backends, (i.e. Anthropic, OpenAI, etc.)

Source code in docprompt/tasks/factory.py
class AbstractTaskProviderFactory(ABC, BaseModel):
    """The abstract interface for a provider task factory.

    We need to define the basic interface for how we can create task providers. The task provider factory
    is responsible for allowing the creation of task providers for specific backends, (i.e. Anthropic, OpenAI, etc.)

    The keyword arguments given to the constructor are passed to validation both as
    the data to validate and, under the "payload" key, as part of the validation context.
    """

    def __init__(self, **data):
        """Validate `data` into this instance with the raw payload as validation context."""
        # NOTE(review): `init_context` presumably makes the dict available through
        # `_init_context_var` for the duration of the block - confirm against its definition.
        with init_context({"payload": data}):
            self.__pydantic_validator__.validate_python(
                data,
                self_instance=self,
                context=_init_context_var.get(),
            )

    @abstractmethod
    @model_validator(mode="after")
    def _validate_provider(self) -> Self:
        """Validate the provider before returning it.

        This method needs to handle credential validation, to ensure that the provider is properly
        configured and can be utilized for the tasks it can be used to provide.
        """

AmazonTaskProviderFactory

Bases: AbstractTaskProviderFactory, PageOCRMixin

The task provider factory for Amazon.

Source code in docprompt/tasks/factory.py
class AmazonTaskProviderFactory(AbstractTaskProviderFactory, PageOCRMixin):
    """The task provider factory for Amazon."""

    # Declared as a private attribute, like `AnthropicTaskProviderFactory` does.
    _credentials: AWSCredentials = PrivateAttr()

    @model_validator(mode="after")
    def _validate_provider(self, info: ValidationInfo) -> Self:
        """Validate the provider before returning it.

        Builds the AWS credentials from the "payload" entry of the validation
        context; invalid credentials raise from `AWSCredentials`.
        """
        _payload = info.context["payload"]
        self._credentials = AWSCredentials(**_payload)
        # An "after" model validator must hand the instance back.
        return self

    def get_page_ocr_provider(self, **kwargs) -> TTaskProvider:
        """Get the page OCR provider.

        The credentials are passed as the provider's `invoke_kwargs`; `kwargs`
        are forwarded to the provider constructor.
        """
        from docprompt.tasks.ocr.amazon import AmazonTextractOCRProvider

        return AmazonTextractOCRProvider(
            invoke_kwargs=self._credentials.kwargs, **kwargs
        )

get_page_ocr_provider(**kwargs)

Get the page OCR provider.

Source code in docprompt/tasks/factory.py
def get_page_ocr_provider(self, **kwargs) -> TTaskProvider:
    """Build the Amazon Textract page OCR provider.

    The factory's credentials are passed as the provider's `invoke_kwargs`;
    `kwargs` are forwarded to the provider constructor.
    """
    # Function-scope import: the backend module is only loaded when requested.
    from docprompt.tasks.ocr.amazon import AmazonTextractOCRProvider

    credential_kwargs = self._credentials.kwargs
    return AmazonTextractOCRProvider(invoke_kwargs=credential_kwargs, **kwargs)

AnthropicTaskProviderFactory

Bases: AbstractTaskProviderFactory, PageClassificationMixin, PageMarkerizationMixin, PageTableExtractionMixin

The task provider factory for Anthropic.

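NOTE: Anthropic models can be reached either through the standard Anthropic API or through AWS Bedrock. This factory covers the standard Anthropic API and authenticates with an API key (read from the ANTHROPIC_API_KEY environment variable when one is not passed explicitly).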

Source code in docprompt/tasks/factory.py
class AnthropicTaskProviderFactory(
    AbstractTaskProviderFactory,
    PageClassificationMixin,
    PageMarkerizationMixin,
    PageTableExtractionMixin,
):
    """The task provider factory for Anthropic.

    NOTE: Anthropic models can be reached either through the standard Anthropic API or
    through AWS Bedrock. This factory authenticates with an API key, taken from the
    payload's `api_key` or from the `ANTHROPIC_API_KEY` environment variable.
    """

    _credentials: APIKeyCredential = PrivateAttr()

    @model_validator(mode="after")
    def _validate_provider(self, info: ValidationInfo) -> Self:
        """Validate the provider before returning it.

        Builds the API key credential from the "payload" entry of the
        validation context, falling back to `ANTHROPIC_API_KEY`.
        """
        _payload = info.context["payload"]
        self._credentials = APIKeyCredential(
            environ_path="ANTHROPIC_API_KEY", **_payload
        )
        return self

    def get_page_classification_provider(self, **kwargs) -> TTaskProvider:
        """Get the page classification provider.

        `kwargs` are forwarded to the provider constructor, as in the other
        provider getters of this factory.
        """
        from docprompt.tasks.classification.anthropic import (
            AnthropicClassificationProvider,
        )

        return AnthropicClassificationProvider(
            invoke_kwargs=self._credentials.kwargs, **kwargs
        )

    def get_page_table_extraction_provider(self, **kwargs) -> TTaskProvider:
        """Get the page table extraction provider.

        `kwargs` are forwarded to the provider constructor.
        """
        from docprompt.tasks.table_extraction.anthropic import (
            AnthropicTableExtractionProvider,
        )

        return AnthropicTableExtractionProvider(
            invoke_kwargs=self._credentials.kwargs, **kwargs
        )

    def get_page_markerization_provider(self, **kwargs) -> TTaskProvider:
        """Get the page markerization provider.

        `kwargs` are forwarded to the provider constructor.
        """
        from docprompt.tasks.markerize.anthropic import AnthropicMarkerizeProvider

        return AnthropicMarkerizeProvider(
            invoke_kwargs=self._credentials.kwargs, **kwargs
        )

get_page_classification_provider(**kwargs)

Get the page classification provider.

Source code in docprompt/tasks/factory.py
def get_page_classification_provider(self, **kwargs) -> TTaskProvider:
    """Get the page classification provider.

    The factory's credentials are passed as the provider's `invoke_kwargs`.
    `kwargs` are forwarded to the provider constructor instead of being
    silently dropped.
    """
    from docprompt.tasks.classification.anthropic import (
        AnthropicClassificationProvider,
    )

    return AnthropicClassificationProvider(
        invoke_kwargs=self._credentials.kwargs, **kwargs
    )

get_page_markerization_provider(**kwargs)

Get the page markerization provider.

Source code in docprompt/tasks/factory.py
def get_page_markerization_provider(self, **kwargs) -> TTaskProvider:
    """Build the Anthropic page markerization provider.

    The factory's credentials are passed as the provider's `invoke_kwargs`;
    `kwargs` are forwarded to the provider constructor.
    """
    # Function-scope import: the backend module is only loaded when requested.
    from docprompt.tasks.markerize.anthropic import AnthropicMarkerizeProvider

    credential_kwargs = self._credentials.kwargs
    return AnthropicMarkerizeProvider(invoke_kwargs=credential_kwargs, **kwargs)

get_page_table_extraction_provider(**kwargs)

Get the page table extraction provider.

Source code in docprompt/tasks/factory.py
def get_page_table_extraction_provider(self, **kwargs) -> TTaskProvider:
    """Build the Anthropic page table extraction provider.

    The factory's credentials are passed as the provider's `invoke_kwargs`;
    `kwargs` are forwarded to the provider constructor.
    """
    # Function-scope import: the backend module is only loaded when requested.
    from docprompt.tasks.table_extraction.anthropic import (
        AnthropicTableExtractionProvider,
    )

    credential_kwargs = self._credentials.kwargs
    return AnthropicTableExtractionProvider(invoke_kwargs=credential_kwargs, **kwargs)

DocumentVQAMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for multi-page document VQA task.

Source code in docprompt/tasks/factory.py
class DocumentVQAMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Factory mixin for the multi-page document VQA task."""

    tags = [DocumentLevelCapabilities.DOCUMENT_VQA]

    @abstractmethod
    def get_document_vqa_provider(self, *args, **kwargs) -> TTaskProvider:
        """Return the provider that performs multi-page document VQA."""

get_document_vqa_provider(*args, **kwargs) abstractmethod

Get the provider that performs multi-page document VQA.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_document_vqa_provider(self, *args, **kwargs) -> TTaskProvider:
    """Return the task provider for multi-page document VQA.

    Abstract: subclasses must supply the implementation.
    """

GCPTaskProviderFactory

Bases: AbstractTaskProviderFactory, PageOCRMixin

The task provider factory for GCP.

Source code in docprompt/tasks/factory.py
class GCPTaskProviderFactory(
    AbstractTaskProviderFactory,
    PageOCRMixin,
):
    """The task provider factory for GCP."""

    @model_validator(mode="after")
    def _validate_provider(self, info: ValidationInfo) -> Self:
        """Build the GCP credentials from the ``payload`` entry of the validation context."""
        self._credentials = GCPServiceFileCredentials(**info.context["payload"])
        return self

    def get_page_ocr_provider(
        self, project_id: str, processor_id: str, **kwargs
    ) -> TTaskProvider:
        """Build the page OCR provider.

        Args:
            project_id: Passed positionally to ``GoogleOcrProvider``.
            processor_id: Passed positionally to ``GoogleOcrProvider``.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            A ``GoogleOcrProvider`` whose ``invoke_kwargs`` are the factory's
            credential kwargs.
        """
        # Imported inside the call rather than at module import time.
        from docprompt.tasks.ocr.gcp import GoogleOcrProvider

        credential_kwargs = self._credentials.kwargs
        return GoogleOcrProvider(
            project_id, processor_id, invoke_kwargs=credential_kwargs, **kwargs
        )

get_page_ocr_provider(project_id, processor_id, **kwargs)

Get the page OCR provider.

Source code in docprompt/tasks/factory.py
def get_page_ocr_provider(
    self, project_id: str, processor_id: str, **kwargs
) -> TTaskProvider:
    """Build the page OCR provider.

    Args:
        project_id: Passed positionally to ``GoogleOcrProvider``.
        processor_id: Passed positionally to ``GoogleOcrProvider``.
        **kwargs: Extra keyword arguments forwarded to the constructor.

    Returns:
        A ``GoogleOcrProvider`` whose ``invoke_kwargs`` are the factory's
        credential kwargs.
    """
    # Imported inside the call rather than at module import time.
    from docprompt.tasks.ocr.gcp import GoogleOcrProvider

    credential_kwargs = self._credentials.kwargs
    return GoogleOcrProvider(
        project_id, processor_id, invoke_kwargs=credential_kwargs, **kwargs
    )

PageClassificationMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for page classification task.

Source code in docprompt/tasks/factory.py
class PageClassificationMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Mixin for page classification task."""

    # Capability tag declared by this mixin.
    tags = [PageLevelCapabilities.PAGE_CLASSIFICATION]

    @abstractmethod
    def get_page_classification_provider(self, *args, **kwargs) -> TTaskProvider:
        """Return the task provider for page classification.

        Abstract: classes using this mixin must supply the implementation.
        """

get_page_classification_provider(*args, **kwargs) abstractmethod

Perform page classification.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_page_classification_provider(self, *args, **kwargs) -> TTaskProvider:
    """Return the task provider for page classification.

    Abstract: subclasses must supply the implementation.
    """

PageMarkerizationMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for page markerization task.

Source code in docprompt/tasks/factory.py
class PageMarkerizationMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Mixin for page markerization task."""

    # Capability tag declared by this mixin.
    tags = [PageLevelCapabilities.PAGE_MARKERIZATION]

    @abstractmethod
    def get_page_markerization_provider(self, *args, **kwargs) -> TTaskProvider:
        """Return the task provider for page markerization.

        Abstract: classes using this mixin must supply the implementation.
        """

get_page_markerization_provider(*args, **kwargs) abstractmethod

Perform page markerization.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_page_markerization_provider(self, *args, **kwargs) -> TTaskProvider:
    """Return the task provider for page markerization.

    Abstract: subclasses must supply the implementation.
    """

PageOCRMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for page OCR task.

Source code in docprompt/tasks/factory.py
class PageOCRMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Mixin for page OCR task."""

    # Capability tags declared by this mixin (layout OCR and text OCR).
    tags = [PageLevelCapabilities.PAGE_LAYOUT_OCR, PageLevelCapabilities.PAGE_TEXT_OCR]

    @abstractmethod
    def get_page_ocr_provider(self, *args, **kwargs) -> TTaskProvider:
        """Return the task provider for page OCR.

        Abstract: classes using this mixin must supply the implementation.
        """

get_page_ocr_provider(*args, **kwargs) abstractmethod

Perform OCR on a page.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_page_ocr_provider(self, *args, **kwargs) -> TTaskProvider:
    """Return the task provider for page OCR.

    Abstract: subclasses must supply the implementation.
    """

PageRasterizationMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for page rasterization task.

Source code in docprompt/tasks/factory.py
class PageRasterizationMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Mixin for page rasterization task."""

    # Capability tag declared by this mixin.
    tags = [PageLevelCapabilities.PAGE_RASTERIZATION]

    @abstractmethod
    def get_rasterize_page_provider(self, **kwargs) -> TTaskProvider:
        """Return the task provider for page rasterization.

        Abstract: classes using this mixin must supply the implementation.
        """

get_rasterize_page_provider(**kwargs) abstractmethod

Perform page rasterization.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_rasterize_page_provider(self, **kwargs) -> TTaskProvider:
    """Return the task provider for page rasterization.

    Abstract: subclasses must supply the implementation.
    """

PageSegmentationMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for page segmentation task.

Source code in docprompt/tasks/factory.py
class PageSegmentationMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Mixin for page segmentation task."""

    # Capability tag declared by this mixin.
    tags = [PageLevelCapabilities.PAGE_SEGMENTATION]

    @abstractmethod
    def get_page_segmentation_provider(self, *args, **kwargs) -> TTaskProvider:
        """Return the task provider for page segmentation.

        Abstract: classes using this mixin must supply the implementation.
        """

get_page_segmentation_provider(*args, **kwargs) abstractmethod

Perform page segmentation.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_page_segmentation_provider(self, *args, **kwargs) -> TTaskProvider:
    """Return the task provider for page segmentation.

    Abstract: subclasses must supply the implementation.
    """

PageTableExtractionMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for page table extraction task.

Source code in docprompt/tasks/factory.py
class PageTableExtractionMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Mixin for page table extraction task."""

    # Capability tag declared by this mixin.
    tags = [PageLevelCapabilities.PAGE_TABLE_EXTRACTION]

    @abstractmethod
    def get_page_table_extraction_provider(self, *args, **kwargs) -> TTaskProvider:
        """Return the task provider for page table extraction.

        Abstract: classes using this mixin must supply the implementation.
        """

get_page_table_extraction_provider(*args, **kwargs) abstractmethod

Extract tables from a page.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_page_table_extraction_provider(self, *args, **kwargs) -> TTaskProvider:
    """Return the task provider for page table extraction.

    Abstract: subclasses must supply the implementation.
    """

PageTableIdentificationMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for page table identification task.

Source code in docprompt/tasks/factory.py
class PageTableIdentificationMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Mixin for page table identification task."""

    # Capability tag declared by this mixin.
    tags = [PageLevelCapabilities.PAGE_TABLE_IDENTIFICATION]

    @abstractmethod
    def get_page_table_identification_provider(self, *args, **kwargs) -> TTaskProvider:
        """Return the task provider for page table identification.

        Abstract: classes using this mixin must supply the implementation.
        """

get_page_table_identification_provider(*args, **kwargs) abstractmethod

Perform page table identification.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_page_table_identification_provider(self, *args, **kwargs) -> TTaskProvider:
    """Return the task provider for page table identification.

    Abstract: subclasses must supply the implementation.
    """

PageVQAMixin

Bases: AbstractTaskMixin, Generic[TTaskProvider]

Mixin for page VQA task.

Source code in docprompt/tasks/factory.py
class PageVQAMixin(AbstractTaskMixin, Generic[TTaskProvider]):
    """Mixin for page VQA task."""

    # Capability tag declared by this mixin.
    tags = [PageLevelCapabilities.PAGE_VQA]

    @abstractmethod
    def get_page_vqa_provider(self, *args, **kwargs) -> TTaskProvider:
        """Return the task provider for page VQA.

        Abstract: classes using this mixin must supply the implementation.
        """

get_page_vqa_provider(*args, **kwargs) abstractmethod

Perform page VQA.

Source code in docprompt/tasks/factory.py
@abstractmethod
def get_page_vqa_provider(self, *args, **kwargs) -> TTaskProvider:
    """Return the task provider for page VQA.

    Abstract: subclasses must supply the implementation.
    """

markerize

anthropic

base

message

The core primitives for any language model interfacing. Docprompt uses these for the prompt garden, but supports free conversion to and from these types from other libraries.

OpenAIMessage

Bases: BaseModel

Source code in docprompt/tasks/message.py
class OpenAIMessage(BaseModel):
    # A chat message with a role and either plain-string or structured
    # content, plus converters to LangChain and LlamaIndex message types.
    role: Literal["system", "user", "assistant"]
    content: Union[str, List[OpenAIComplexContent]]

    def to_langchain_message(self):
        """Convert this message to the LangChain message class for its role.

        Returns:
            A ``SystemMessage``, ``HumanMessage`` or ``AIMessage`` whose
            content is this message's JSON-dumped ``content``.

        Raises:
            ImportError: If ``langchain.schema`` cannot be imported.
        """
        try:
            from langchain.schema import AIMessage, HumanMessage, SystemMessage
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "Could not import langchain.schema. Install with `docprompt[langchain]`"
            ) from err

        role_mapping = {
            "system": SystemMessage,
            "user": HumanMessage,
            "assistant": AIMessage,
        }

        dumped = self.model_dump(mode="json", exclude_unset=True, exclude_none=True)

        return role_mapping[self.role](content=dumped["content"])

    def to_openai(self):
        """Return the message as a JSON-mode dict, omitting unset and ``None`` fields."""
        return self.model_dump(mode="json", exclude_unset=True, exclude_none=True)

    def to_llamaindex_chat_message(self):
        """Convert this message to a LlamaIndex ``ChatMessage``.

        Returns:
            The result of ``ChatMessage.from_str`` called with this message's
            JSON-dumped ``content`` and the matching ``MessageRole``.

        Raises:
            ImportError: If ``llama_index.core`` cannot be imported.
        """
        try:
            from llama_index.core.base.llms.types import ChatMessage, MessageRole
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "Could not import llama_index.core. Install with `docprompt[llamaindex]`"
            ) from err

        role_mapping = {
            "system": MessageRole.SYSTEM,
            "user": MessageRole.USER,
            "assistant": MessageRole.ASSISTANT,
        }

        dumped = self.model_dump(mode="json", exclude_unset=True, exclude_none=True)

        return ChatMessage.from_str(
            content=dumped["content"], role=role_mapping[self.role]
        )

    @classmethod
    def from_image_uri(cls, image_uri: str) -> "OpenAIMessage":
        """Create an image message from a URI.

        The message is always created with the ``"user"`` role; there is no
        ``role`` argument.

        Args:
            image_uri: The URI of the image.

        Returns:
            A message whose content is a single ``image_url`` entry.
        """
        image_url = OpenAIImageURL(url=image_uri)
        content = OpenAIComplexContent(type="image_url", image_url=image_url)
        message = cls(role="user", content=[content])
        return message

from_image_uri(image_uri) classmethod

Create an image message from a URI.

Parameters:

Name Type Description Default
role

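The role of the message. Note: `from_image_uri` does not actually accept a `role` argument; the message is always created with the role "user".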

required
image_uri str

The URI of the image.

required
Source code in docprompt/tasks/message.py
@classmethod
def from_image_uri(cls, image_uri: str) -> "OpenAIMessage":
    """Build a ``user`` message whose only content entry is an image URL.

    Args:
        image_uri: The URI of the image.

    Returns:
        A message with role ``"user"`` and a single ``image_url`` content
        entry wrapping ``image_uri``.
    """
    return cls(
        role="user",
        content=[
            OpenAIComplexContent(
                type="image_url",
                image_url=OpenAIImageURL(url=image_uri),
            )
        ],
    )

ocr

amazon

base

gcp

GoogleOcrProvider

Bases: BaseOCRProvider

Source code in docprompt/tasks/ocr/gcp.py
class GoogleOcrProvider(BaseOCRProvider):
    # OCR provider that sends PDF chunks to a Google Cloud Document AI
    # processor and merges the per-chunk responses with
    # `gcp_documents_to_result`.
    name = "gcp_documentai"

    capabilities = [
        PageLevelCapabilities.PAGE_TEXT_OCR,
        PageLevelCapabilities.PAGE_LAYOUT_OCR,
        PageLevelCapabilities.PAGE_RASTERIZATION,
    ]

    max_bytes_per_request: ClassVar[int] = (
        1024 * 1024 * 20
    )  # 20MB is the max size for a single sync request
    max_page_count: ClassVar[int] = 15

    project_id: str = Field(...)
    processor_id: str = Field(...)

    service_account_info: Optional[Dict[str, str]] = Field(None)
    service_account_file: Optional[str] = Field(None)
    location: str = Field("us")
    max_workers: int = Field(multiprocessing.cpu_count() * 2)
    exclude_bounding_poly: bool = Field(False)
    return_images: bool = Field(False)
    return_image_quality_scores: bool = Field(False)

    # Holds the imported `google.cloud.documentai` module (assigned in __init__).
    _documentai: "documentai.DocumentProcessorServiceClient" = PrivateAttr()

    def __init__(
        self,
        project_id: str,
        processor_id: str,
        **kwargs,
    ):
        """Initialise the provider and import the Document AI client library.

        Args:
            project_id: Project identifier used to build the processor path.
            processor_id: Processor identifier used to build the processor path.
            **kwargs: Remaining model fields, forwarded to the base initialiser.

        Raises:
            ImportError: If ``google-cloud-documentai`` is not installed.
        """
        super().__init__(project_id=project_id, processor_id=processor_id, **kwargs)

        # Credentials are taken from the default invoke kwargs; this overwrites
        # any value given for these two fields through **kwargs.
        self.service_account_info = self._default_invoke_kwargs.get(
            "service_account_info", None
        )
        self.service_account_file = self._default_invoke_kwargs.get(
            "service_account_file", None
        )

        try:
            from google.cloud import documentai
        except ImportError as err:
            # The message previously named a class that does not exist here.
            raise ImportError(
                "Please install 'google-cloud-documentai' to use the GoogleOcrProvider"
            ) from err

        self._documentai = documentai

    def get_documentai_client(
        self, client_option_kwargs: Optional[dict] = None, **kwargs
    ):
        """Create a ``DocumentProcessorServiceClient`` from the stored credentials.

        Args:
            client_option_kwargs: Optional overrides merged into the
                ``ClientOptions`` arguments (e.g. a custom ``api_endpoint``).
            **kwargs: Extra keyword arguments passed to the client constructor.

        Returns:
            A client built from ``service_account_info`` when set, otherwise
            from ``service_account_file``.

        Raises:
            ValueError: If neither credential source is set.
        """
        from google.api_core.client_options import ClientOptions

        opts = ClientOptions(
            **{
                # Derive the endpoint from the configured location so that a
                # non-"us" processor is reached on its own regional endpoint.
                # The default location "us" yields the previous hard-coded value.
                "api_endpoint": f"{self.location}-documentai.googleapis.com",
                **(client_option_kwargs or {}),
            }
        )

        base_service_client_kwargs = {
            **kwargs,
            "client_options": opts,
        }

        if self.service_account_info is not None:
            return self._documentai.DocumentProcessorServiceClient.from_service_account_info(
                info=self.service_account_info,
                **base_service_client_kwargs,
            )
        elif self.service_account_file is not None:
            # Reads of the service account file are serialised by a shared lock.
            with service_account_file_read_lock:
                return self._documentai.DocumentProcessorServiceClient.from_service_account_file(
                    filename=self.service_account_file,
                    **base_service_client_kwargs,
                )
        else:
            raise ValueError("Missing account info and service file path.")

    def _get_process_options(self):
        """Return process options enabling image quality scores, or ``None`` if disabled."""
        if not self.return_image_quality_scores:
            return None

        return self._documentai.ProcessOptions(
            ocr_config=self._documentai.OcrConfig(
                enable_image_quality_scores=True,
            )
        )

    def _build_process_request(self, processor_name: str, split_bytes: bytes):
        """Build the Document AI ``ProcessRequest`` for one PDF chunk."""
        raw_document = self._documentai.RawDocument(
            content=split_bytes,
            mime_type="application/pdf",
        )

        # NOTE(review): the earlier code assembled a field-mask string
        # ("text,pages.layout,pages.words,pages.lines,pages.tokens,pages.blocks"
        # plus optional image entries) but never attached it to the request.
        # That dead local is dropped here and the request is unchanged --
        # confirm whether a field mask was actually intended.
        return self._documentai.ProcessRequest(
            name=processor_name,
            raw_document=raw_document,
            process_options=self._get_process_options(),
        )

    def _process_document_sync(self, document: Document):
        """
        Split the document into chunks of 15 pages or less, and process each chunk
        synchronously.
        """
        client = self.get_documentai_client()
        processor_name = client.processor_path(
            project=self.project_id,
            location=self.location,
            processor=self.processor_id,
        )

        documents: List["documentai.Document"] = []

        file_bytes = document.get_bytes()

        @default_retry_decorator
        def process_byte_chunk(split_bytes: bytes) -> "documentai.Document":
            request = self._build_process_request(processor_name, split_bytes)
            return client.process_document(request=request).document

        with tqdm.tqdm(
            total=len(file_bytes), unit="B", unit_scale=True, desc="Processing document"
        ) as pbar:
            for split_bytes in pdf_split_iter_with_max_bytes(
                file_bytes,
                max_page_count=self.max_page_count,
                max_bytes=self.max_bytes_per_request,
            ):
                # Do not rebind `document` here: the reads of `document.name`
                # and `document.document_hash` below must see the input
                # document, not the last Document AI response.
                documents.append(process_byte_chunk(split_bytes))

                pbar.update(len(split_bytes))

        return gcp_documents_to_result(
            documents,
            self.name,
            document_name=document.name,
            file_hash=document.document_hash,
            exclude_bounding_poly=self.exclude_bounding_poly,
            return_images=self.return_images,
        )

    def _process_document_concurrent(
        self,
        document: Document,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        include_raster: bool = False,
    ):
        """Split the document into chunks and process them in a thread pool.

        Args:
            document: The document to OCR.
            start: Accepted but not used by this method.
            stop: Accepted but not used by this method.
            include_raster: Accepted but not used by this method.

        Returns:
            The value of ``gcp_documents_to_result`` for the per-chunk
            responses, kept in chunk order.
        """
        # Process page chunks concurrently
        client = self.get_documentai_client()
        processor_name = client.processor_path(
            project=self.project_id,
            location=self.location,
            processor=self.processor_id,
        )

        file_bytes = document.file_bytes

        # More than 2 MiB per page on average: compress before splitting.
        if document.bytes_per_page > 1024 * 1024 * 2:
            logger.info("Document has few pages but is large, compressing first")
            file_bytes = document.to_compressed_bytes()

        logger.info("Splitting document into chunks...")
        document_byte_splits = list(
            pdf_split_iter_with_max_bytes(
                file_bytes,
                max_page_count=self.max_page_count,
                max_bytes=self.max_bytes_per_request,
            )
        )

        max_workers = min(len(document_byte_splits), self.max_workers)

        @default_retry_decorator
        def process_byte_chunk(split_bytes: bytes):
            request = self._build_process_request(processor_name, split_bytes)
            return client.process_document(request=request).document

        logger.info(f"Processing {len(document_byte_splits)} chunks...")
        with tqdm.tqdm(
            total=len(document_byte_splits), desc="Processing document"
        ) as pbar:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_index = {
                    executor.submit(process_byte_chunk, split): index
                    for index, split in enumerate(document_byte_splits)
                }

                # Pre-sized so results can be stored by chunk index even though
                # futures complete out of order.
                documents: List["documentai.Document"] = [None] * len(
                    document_byte_splits
                )  # type: ignore

                for future in as_completed(future_to_index):
                    index = future_to_index[future]
                    documents[index] = future.result()
                    pbar.update(1)

        logger.info("Recombining OCR results...")
        return gcp_documents_to_result(
            documents,
            self.name,
            document_name=document.name,
            file_hash=document.document_hash,
            exclude_bounding_poly=self.exclude_bounding_poly,
            return_images=self.return_images,
        )

    def _invoke(
        self,
        input: List[PdfDocument],
        config: None = None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        **kwargs,
    ):
        """Run OCR on exactly one document.

        Args:
            input: A list containing a single document.
            config: Unused.
            start: Forwarded to ``_process_document_concurrent``.
            stop: Forwarded to ``_process_document_concurrent``.
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If ``input`` does not contain exactly one document.
        """
        if len(input) != 1:
            raise ValueError(
                "GoogleOcrProvider only supports processing a single document at a time."
            )

        return self._process_document_concurrent(input[0], start=start, stop=stop)

    def process_document_node(
        self,
        document_node: "DocumentNode",
        task_config: None = None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        contribute_to_document: bool = True,
        **kwargs,
    ) -> Dict[int, OcrPageResult]:
        """Run OCR for a document node and record the results on it.

        Args:
            document_node: Node whose ``document`` is processed.
            task_config: Unused.
            start: Forwarded to ``invoke``.
            stop: Forwarded to ``invoke``.
            contribute_to_document: Accepted but not used by this method.
            **kwargs: Forwarded to ``invoke``.

        Returns:
            The result returned by ``invoke``.
        """
        # Pass the document object, not its raw bytes: `_invoke` hands the
        # item to `_process_document_concurrent`, which reads attributes such
        # as `file_bytes`, `bytes_per_page`, `name` and `document_hash`.
        base_result = self.invoke(
            [document_node.document], start=start, stop=stop, **kwargs
        )

        # For OCR, we also need to populate the ocr_results for powered search
        self._populate_ocr_results(document_node, base_result)

        return base_result

text_from_layout(layout, document_text, offset=0)

Offset is used to account for the fact that text references are relative to the entire document.

Source code in docprompt/tasks/ocr/gcp.py
def text_from_layout(
    layout: Union["documentai.Document.Page.Layout", "documentai.Document.Page.Token"],
    document_text: str,
    offset: int = 0,
) -> str:
    """
    Concatenate the slices of ``document_text`` referenced by ``layout``.

    The segments in ``layout.text_anchor.text_segments`` are visited in
    ascending ``end_index`` order. A segment without ``start_index`` starts
    at 0.

    Offset is used to account for the fact that text references
    are relative to the entire document: it is subtracted from both ends of
    every segment before slicing.
    """
    ordered_segments = sorted(
        layout.text_anchor.text_segments, key=lambda segment: segment.end_index
    )

    pieces = []
    for segment in ordered_segments:
        begin = getattr(segment, "start_index", 0) - offset
        finish = segment.end_index - offset
        pieces.append(document_text[begin:finish])

    return "".join(pieces)

result

tesseract

parser

The base output parser that seeks to mimic the langchain implementation.

BaseOutputParser

Bases: BaseModel, Generic[TTaskInput, TTaskOutput]

The output parser for the page classification system.

Source code in docprompt/tasks/parser.py
class BaseOutputParser(BaseModel, Generic[TTaskInput, TTaskOutput]):
    """The output parser for the page classification system."""

    # `from_task_input` takes `cls`, so it must be a classmethod; without the
    # decorator a call on the class would bind `task_input` to `cls`.
    @classmethod
    @abstractmethod
    def from_task_input(
        cls, task_input: TTaskInput
    ) -> "BaseOutputParser[TTaskInput, TTaskOutput]":
        """Create an output parser from the task input.

        Args:
            task_input: The task input the parser is built from.

        Returns:
            A parser instance.
        """

    @abstractmethod
    def parse(self, text: str) -> TTaskOutput:
        """Parse ``text`` into a ``TTaskOutput`` value.

        Args:
            text: The text to parse.
        """

from_task_input(task_input) abstractmethod

Create an output parser from the task input.

Source code in docprompt/tasks/parser.py
# Takes `cls`, so it must be a classmethod; `@classmethod` goes outermost so
# that it wraps the abstract function.
@classmethod
@abstractmethod
def from_task_input(
    cls, task_input: TTaskInput
) -> "BaseOutputParser[TTaskInput, TTaskOutput]":
    """Create an output parser from the task input.

    Args:
        task_input: The task input the parser is built from.

    Returns:
        A parser instance.
    """

parse(text) abstractmethod

Parse the results of the classification task.

Source code in docprompt/tasks/parser.py
@abstractmethod
def parse(self, text: str) -> TTaskOutput:
    """Parse ``text`` into a ``TTaskOutput`` value.

    Abstract: subclasses must supply the implementation.

    Args:
        text: The text to parse.
    """

result

BaseResult

Bases: BaseModel

Source code in docprompt/tasks/result.py
class BaseResult(BaseModel):
    # Base model for task results: records which provider produced the result
    # and when, and requires subclasses to implement
    # `contribute_to_document_node`.
    provider_name: str = Field(
        description="The name of the provider which produced the result"
    )
    # Defaults to `datetime.now()` (called without a timezone) at creation.
    when: datetime = Field(
        default_factory=datetime.now, description="The time the result was produced"
    )

    # Declared without a value here; `task_key` reads it.
    task_name: ClassVar[str]

    @property
    def task_key(self) -> str:
        """Return ``"<provider_name>_<task_name>"``."""
        return f"{self.provider_name}_{self.task_name}"

    @abstractmethod
    def contribute_to_document_node(
        self, document_node: "DocumentNode", **kwargs
    ) -> None:
        """
        Contribute this task result to the document node or a specific page node.

        :param document_node: The DocumentNode to contribute to
        :param kwargs: Additional keyword arguments. NOTE(review): ``page_number``
            is not a named parameter of this signature; presumably implementations
            read it from ``kwargs`` (contribute to that page if provided, otherwise
            to the document) -- confirm against the subclasses.
        """

contribute_to_document_node(document_node, **kwargs) abstractmethod

Contribute this task result to the document node or a specific page node.

:param document_node: The DocumentNode to contribute to :param page_number: If provided, contribute to a specific page. If None, contribute to the document.

Source code in docprompt/tasks/result.py
@abstractmethod
def contribute_to_document_node(
    self, document_node: "DocumentNode", **kwargs
) -> None:
    """
    Contribute this task result to the document node or a specific page node.

    :param document_node: The DocumentNode to contribute to
    :param kwargs: Additional keyword arguments. NOTE(review): ``page_number``
        is not a named parameter of this signature; presumably implementations
        read it from ``kwargs`` (contribute to that page if provided, otherwise
        to the document) -- confirm against the subclasses.
    """

ResultContainer

Bases: BaseModel, Generic[PageOrDocumentTaskResult]

Represents a container for results of a task

Source code in docprompt/tasks/result.py
class ResultContainer(BaseModel, Generic[PageOrDocumentTaskResult]):
    """
    Represents a container for results of a task
    """

    # Mapping of provider key to result; starts out as an empty dict.
    results: Dict[str, PageOrDocumentTaskResult] = Field(
        default_factory=dict,
        description="The results of the task, keyed by provider",
    )

    @property
    def result(self):
        """Return the first stored result, or ``None`` when there are none."""
        for first_result in self.results.values():
            return first_result
        return None

table_extraction

anthropic

base

schema