Index

`base`

`AbstractDocumentTaskProvider`

Bases: AbstractTaskProvider

A task provider performs a specific, repeatable task on a document.

Source code in docprompt/tasks/base.py

class AbstractDocumentTaskProvider(AbstractTaskProvider):
    """
    A task provider performs a specific, repeatable task on a document.

    This base class only narrows the declared type of `capabilities` to
    document-level capabilities and marks itself as abstract; it adds no
    behavior of its own.
    """

    # Class-level list of the document-level capabilities a provider offers.
    capabilities: ClassVar[List[DocumentLevelCapabilities]]

    # NOTE: We need the stubs defined here for the flexible decorators to work
    # for now

    class Meta:
        # Flags this class as a non-concrete implementation.
        abstract = True

`AbstractPageTaskProvider`

Bases: AbstractTaskProvider

A page task provider performs a specific, repeatable task on a page.

Source code in docprompt/tasks/base.py

class AbstractPageTaskProvider(AbstractTaskProvider):
    """
    A page task provider performs a specific, repeatable task on a page.

    This base class only narrows the declared type of `capabilities` to
    page-level capabilities and marks itself as abstract; it adds no behavior
    of its own.
    """

    # Class-level list of the page-level capabilities a provider offers.
    capabilities: ClassVar[List[PageLevelCapabilities]]

    # NOTE: We need the stubs defined here for the flexible decorators to work
    # for now

    class Meta:
        # Flags this class as a non-concrete implementation.
        abstract = True

`AbstractTaskProvider`

Bases: BaseModel, Generic[TTaskInput, TTaskConfig, TTaskResult]

A task provider performs a specific, repeatable task on a document or its pages.

NOTE: Either the process_document_node or aprocess_document_node method must be implemented in a valid subclass. The process_document_node method is explicitly defined, while the aprocess_document_node method is an async version of the same method.

If you wish to provide separate implementations for sync and async, you can define both methods individually, and they will each use their own custom implementation when called. Otherwise, if you only implement one or the other of a flexible method pair, the other will automatically be generated and provided for you at runtime.

Source code in docprompt/tasks/base.py

@flexible_methods(
    ("process_document_node", "aprocess_document_node"),
    ("_invoke", "_ainvoke"),
)
class AbstractTaskProvider(BaseModel, Generic[TTaskInput, TTaskConfig, TTaskResult]):
    """
    A task provider performs a specific, repeatable task on a document or its pages.

    NOTE: Either the `process_document_node` or `aprocess_document_node` method must be implemented in
    a valid subclass. The `process_document_node` method is explicitly defined, while the
    `aprocess_document_node` method is an async version of the same method. The `_invoke` / `_ainvoke`
    pair is registered with the `flexible_methods` decorator in the same way.

    If you wish to provide separate implementations for sync and async, you can define both methods individually, and
    they will each use their own custom implementation when called. Otherwise, if you only implement one or the other of
    a flexible method pair, the other will automatically be generated and provided for you at runtime.
    """

    name: ClassVar[str]
    capabilities: ClassVar[List[Capabilites]]

    # TODO: Potentially utilize context here during instantiation from Factory??
    _default_invoke_kwargs: Dict[str, str] = PrivateAttr()

    class Meta:
        """The meta class is utilized by the flexible methods decorator.

        For all classes that are not concrete implementations, we should set the
        abstract attribute to True, which will prevent the check from failing when
        the flexible methods decorator is looking for the implementation of the
        methods.
        """

        abstract = True

    def __init__(self, invoke_kwargs: Optional[Dict[str, str]] = None, **data):
        """
        Validate `data` into this instance, exposing `invoke_kwargs` to validators.

        Args:
            invoke_kwargs: Default keyword arguments merged into every
                `invoke` / `ainvoke` call. `None` is treated as an empty dict.
            **data: Field values handed to the pydantic validator.
        """
        # The kwargs travel through the validation context so that the
        # `set_invoke_kwargs` after-validator can store them on the instance.
        with init_context({"invoke_kwargs": invoke_kwargs or {}}):
            self.__pydantic_validator__.validate_python(
                data,
                self_instance=self,
                context=_init_context_var.get(),
            )

    @model_validator(mode="before")
    @classmethod
    def validate_class_vars(cls, data: Any) -> Any:
        """
        Ensure that the class has a name and capabilities defined.

        Raises:
            ValueError: If `name` or `capabilities` is missing on the class, or
                if `capabilities` is empty.
        """

        if not hasattr(cls, "name"):
            raise ValueError("Task providers must have a name defined")

        if not hasattr(cls, "capabilities"):
            raise ValueError("Task providers must have capabilities defined")

        if not cls.capabilities:
            raise ValueError("Task providers must have at least one capability defined")

        return data

    @model_validator(mode="after")
    def set_invoke_kwargs(self, info: ValidationInfo) -> Self:
        """
        Set the default invoke kwargs for the task provider.

        The kwargs are read from the validation context. When validation runs
        without a context (or without an "invoke_kwargs" entry), the defaults
        fall back to an empty dict instead of raising.
        """
        # `info.context` is None when no context was supplied to validation.
        context = info.context or {}
        self._default_invoke_kwargs = context.get("invoke_kwargs", {})
        return self

    async def _ainvoke(
        self,
        input: Iterable[TTaskInput],
        config: Optional[TTaskConfig] = None,
        **kwargs,
    ) -> List[TTaskResult]:
        """Async stub of the task implementation; raises NotImplementedError here."""
        raise NotImplementedError

    async def ainvoke(
        self,
        input: Iterable[TTaskInput],
        config: Optional[TTaskConfig] = None,
        **kwargs,
    ) -> List[TTaskResult]:
        """
        Run `_ainvoke` with the default invoke kwargs merged in.

        Keyword arguments passed at the call site override the defaults.
        """
        invoke_kwargs = {
            **self._default_invoke_kwargs,
            **kwargs,
        }

        return await self._ainvoke(input, config, **invoke_kwargs)

    def _invoke(
        self,
        input: Iterable[TTaskInput],
        config: Optional[TTaskConfig] = None,
        **kwargs,
    ) -> List[TTaskResult]:
        """Sync stub of the task implementation; raises NotImplementedError here."""
        raise NotImplementedError

    def invoke(
        self,
        input: Iterable[TTaskInput],
        config: Optional[TTaskConfig] = None,
        **kwargs,
    ) -> List[TTaskResult]:
        """
        Run `_invoke` with the default invoke kwargs merged in.

        Keyword arguments passed at the call site override the defaults.
        """
        invoke_kwargs = {
            **self._default_invoke_kwargs,
            **kwargs,
        }

        return self._invoke(input, config, **invoke_kwargs)

    def process_document_node(
        self,
        document_node: "DocumentNode",
        task_config: Optional[TTaskConfig] = None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        contribute_to_document: bool = True,
        **kwargs,
    ) -> Dict[int, TTaskResult]:
        """Sync stub for processing a document node; raises NotImplementedError here."""
        raise NotImplementedError

    async def aprocess_document_node(
        self,
        document_node: "DocumentNode",
        task_config: Optional[TTaskConfig] = None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        contribute_to_document: bool = True,
        **kwargs,
    ) -> Dict[int, TTaskResult]:
        """Async stub for processing a document node; raises NotImplementedError here."""
        raise NotImplementedError

`Meta`

The meta class is utilized by the flexible methods decorator.

For all classes that are not concrete implementations, we should set the abstract attribute to True, which will prevent the check from failing when the flexible methods decorator is looking for the implementation of the methods.

Source code in docprompt/tasks/base.py

class Meta:
    """The meta class is utilized by the flexible methods decorator.

    For all classes that are not concrete implementations, we should set the
    abstract attribute to True, which will prevent the check from failing when
    the flexible methods decorator is looking for the implementation of the
    methods.
    """

    # This class is not a concrete implementation.
    abstract = True

`set_invoke_kwargs(info)`

Set the default invoke kwargs for the task provider.

Source code in docprompt/tasks/base.py

@model_validator(mode="after")
def set_invoke_kwargs(self, info: ValidationInfo) -> Self:
    """
    Set the default invoke kwargs for the task provider.

    The kwargs are read from the validation context. When validation runs
    without a context (or without an "invoke_kwargs" entry), the defaults
    fall back to an empty dict instead of raising.
    """
    # `info.context` is None when no context was supplied to validation.
    context = info.context or {}
    self._default_invoke_kwargs = context.get("invoke_kwargs", {})
    return self

`validate_class_vars(data)` `classmethod`

Ensure that the class has a name and capabilities defined.

Source code in docprompt/tasks/base.py

@model_validator(mode="before")
@classmethod
def validate_class_vars(cls, data: Any) -> Any:
    """
    Check the class-level `name` and `capabilities` declarations.

    The input `data` is returned untouched; a ValueError is raised when
    either attribute is absent or when `capabilities` is empty.
    """
    # Checked in this order so that a missing `name` is reported first.
    required_attributes = (
        ("name", "Task providers must have a name defined"),
        ("capabilities", "Task providers must have capabilities defined"),
    )
    for attribute, message in required_attributes:
        if not hasattr(cls, attribute):
            raise ValueError(message)

    if not cls.capabilities:
        raise ValueError("Task providers must have at least one capability defined")

    return data

`capabilities`

`PageLevelCapabilities`

Bases: str, Enum

Represents a capability that a provider can fulfill

Source code in docprompt/tasks/capabilities.py

class PageLevelCapabilities(str, Enum):
    """
    Page-level capabilities a provider can declare.

    Members are also `str` instances, so each compares equal to its
    hyphenated string value.
    """

    # Rasterization and OCR
    PAGE_RASTERIZATION = "page-rasterization"
    PAGE_LAYOUT_OCR = "page-layout-ocr"
    PAGE_TEXT_OCR = "page-text-ocr"
    # Classification, markerization, segmentation and VQA
    PAGE_CLASSIFICATION = "page-classification"
    PAGE_MARKERIZATION = "page-markerization"
    PAGE_SEGMENTATION = "page-segmentation"
    PAGE_VQA = "page-vqa"
    # Tables
    PAGE_TABLE_IDENTIFICATION = "page-table-identification"
    PAGE_TABLE_EXTRACTION = "page-table-extraction"

`classification`

`anthropic`

The Anthropic implementation of page-level classification.

`AnthropicClassificationProvider`

Bases: BaseClassificationProvider

The Anthropic implementation of unscored page classification.

Source code in docprompt/tasks/classification/anthropic.py

class AnthropicClassificationProvider(BaseClassificationProvider):
    """Page classification provider that runs Anthropic batch inference."""

    name = "anthropic"

    async def _ainvoke(
        self, input: Iterable[bytes], config: ClassificationConfig = None, **kwargs
    ) -> List[ClassificationOutput]:
        """
        Classify each input and return one parsed output per completion.

        Messages are built from `input` and `config`, sent through the batch
        inference helper, and each completion is parsed with a parser
        configured from `config` and this provider's name.

        NOTE(review): `**kwargs` is accepted but not forwarded to the
        inference call below -- confirm whether that is intended.
        """
        prompts = _prepare_messages(input, config)

        output_parser = AnthropicPageClassificationOutputParser.from_task_input(
            config, provider_name=self.name
        )

        raw_completions = await inference.run_batch_inference_anthropic(prompts)

        parsed_outputs = []
        for completion in raw_completions:
            parsed_outputs.append(output_parser.parse(completion))
        return parsed_outputs

`AnthropicPageClassificationOutputParser`

Bases: BasePageClassificationOutputParser

The output parser for the page classification system.

Source code in docprompt/tasks/classification/anthropic.py

class AnthropicPageClassificationOutputParser(BasePageClassificationOutputParser):
    """Parses completion text into `ClassificationOutput` objects."""

    def parse(self, text: str) -> ClassificationOutput:
        """
        Build a `ClassificationOutput` from a completion.

        The label(s) come from the first "Answer: ..." match. When
        `self.confidence` is set, the first "Confidence: ..." match is
        resolved as well and passed as `score`.
        """
        answer_match = re.search(r"Answer: (.+)", text)
        labels = self.resolve_match(answer_match)

        if not self.confidence:
            # Confidence was not requested: leave `score` unset.
            return ClassificationOutput(
                type=self.type, labels=labels, provider_name=self.name
            )

        confidence_match = re.search(r"Confidence: (.+)", text)
        score = self.resolve_confidence(confidence_match)

        return ClassificationOutput(
            type=self.type,
            labels=labels,
            score=score,
            provider_name=self.name,
        )

`parse(text)`

Parse the results of the classification task.

Source code in docprompt/tasks/classification/anthropic.py

def parse(self, text: str) -> ClassificationOutput:
    """
    Build a `ClassificationOutput` from a completion.

    The label(s) come from the first "Answer: ..." match. When
    `self.confidence` is set, the first "Confidence: ..." match is resolved
    as well and passed as `score`.
    """
    answer_match = re.search(r"Answer: (.+)", text)
    labels = self.resolve_match(answer_match)

    if not self.confidence:
        # Confidence was not requested: leave `score` unset.
        return ClassificationOutput(
            type=self.type, labels=labels, provider_name=self.name
        )

    confidence_match = re.search(r"Confidence: (.+)", text)
    score = self.resolve_confidence(confidence_match)

    return ClassificationOutput(
        type=self.type,
        labels=labels,
        score=score,
        provider_name=self.name,
    )

`base`

`BaseClassificationProvider`

Bases: AbstractPageTaskProvider[bytes, ClassificationConfig, ClassificationOutput]

The base classification provider.

Source code in docprompt/tasks/classification/base.py

class BaseClassificationProvider(
    AbstractPageTaskProvider[bytes, ClassificationConfig, ClassificationOutput]
):
    """
    Shared base for page classification providers.

    Declares the page-classification capability and implements the
    synchronous document-node entry point on top of `_invoke`.
    """

    capabilities = [PageLevelCapabilities.PAGE_CLASSIFICATION]

    class Meta:
        abstract = True

    def process_document_node(
        self,
        document_node: "DocumentNode",
        task_config: ClassificationConfig = None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        contribute_to_document: bool = True,
        **kwargs,
    ):
        """
        Classify a range of pages and return the results keyed by page number.

        Pages are 1-indexed. `start` defaults to 1 and `stop` defaults to
        `len(document_node)`; both ends are inclusive. `task_config` is
        required despite its `None` default.

        NOTE(review): `contribute_to_document` is not read in this method.
        """
        assert (
            task_config is not None
        ), "task_config must be provided for classification tasks"

        first_page = start or 1
        last_page = stop or len(document_node)
        page_numbers = range(first_page, last_page + 1)

        # Rasterize every requested page; `page_nodes` is 0-indexed.
        raster_bytes = [
            document_node.page_nodes[page_number - 1].rasterizer.rasterize("default")
            for page_number in page_numbers
        ]

        # TODO: This is a somewhat dangerous way of requiring these kwargs to be drilled
        # through, potentially a decorator solution to be had here
        invoke_kwargs = {**self._default_invoke_kwargs, **kwargs}
        results = self._invoke(raster_bytes, config=task_config, **invoke_kwargs)

        return dict(zip(page_numbers, results))

`BasePageClassificationOutputParser`

Bases: ABC, BaseOutputParser[ClassificationConfig, ClassificationOutput]

The output parser for the page classification system.

Source code in docprompt/tasks/classification/base.py

class BasePageClassificationOutputParser(
    ABC, BaseOutputParser[ClassificationConfig, ClassificationOutput]
):
    """The output parser for the page classification system."""

    name: str = Field(...)
    type: ClassificationTypes = Field(...)
    labels: LabelType = Field(...)
    confidence: bool = Field(False)

    @classmethod
    def from_task_input(cls, task_input: ClassificationConfig, provider_name: str):
        """Create a parser from a classification config and a provider name."""
        return cls(
            type=task_input.type,
            name=provider_name,
            labels=task_input.labels,
            confidence=task_input.confidence,
        )

    def resolve_match(self, _match: Union[re.Match, None]) -> LabelType:
        """Validate the matched answer against the configured labels.

        Returns the single label for binary and single-label classification,
        or the list of labels for multi-label classification. A captured value
        that is not a known label verbatim is retried with surrounding
        whitespace removed, since `(.+)` also captures a trailing "\\r" or
        spaces.

        Raises:
            ValueError: If there is no match, a label is not one of
                `self.labels`, or `self.type` is not a known type.
        """

        if not _match:
            raise ValueError("Could not find the answer in the text.")

        val = _match.group(1)
        if self.type in (ClassificationTypes.BINARY, ClassificationTypes.SINGLE_LABEL):
            if val not in self.labels:
                # Verbatim lookup failed: retry without stray whitespace.
                val = val.strip()
            if val not in self.labels:
                raise ValueError(f"Invalid label: {val}")
            return val

        elif self.type == ClassificationTypes.MULTI_LABEL:
            labels = [
                label if label in self.labels else label.strip()
                for label in val.split(", ")
            ]
            for label in labels:
                if label not in self.labels:
                    raise ValueError(f"Invalid label: {label}")
            return labels
        else:
            raise ValueError(f"Invalid classification type: {self.type}")

    def resolve_confidence(
        self, _match: Union[re.Match, None]
    ) -> "Optional[ConfidenceLevel]":
        """Get the confidence level from the text.

        Returns None when there is no match. The captured value is stripped
        and lower-cased before lookup, so a trailing "\\r" or spaces do not
        invalidate it. `ConfidenceLevel(...)` raises ValueError for any other
        unknown value.
        """

        if not _match:
            return None

        val = _match.group(1).strip().lower()

        return ConfidenceLevel(val)

    @abstractmethod
    def parse(self, text: str) -> ClassificationOutput: ...

`resolve_confidence(_match)`

Get the confidence level from the text.

Source code in docprompt/tasks/classification/base.py

def resolve_confidence(
    self, _match: Union[re.Match, None]
) -> "Optional[ConfidenceLevel]":
    """Get the confidence level from the text.

    Returns None when there is no match. The captured value is stripped and
    lower-cased before lookup, so a trailing "\\r" or spaces do not
    invalidate it. `ConfidenceLevel(...)` raises ValueError for any other
    unknown value.
    """

    if not _match:
        return None

    val = _match.group(1).strip().lower()

    return ConfidenceLevel(val)

`resolve_match(_match)`

Validate the matched answer against the configured labels and return the resolved label(s).

Source code in docprompt/tasks/classification/base.py

def resolve_match(self, _match: Union[re.Match, None]) -> LabelType:
    """Validate the matched answer against the configured labels.

    Returns the single label for binary and single-label classification, or
    the list of labels for multi-label classification. A captured value that
    is not a known label verbatim is retried with surrounding whitespace
    removed, since `(.+)` also captures a trailing "\\r" or spaces.

    Raises:
        ValueError: If there is no match, a label is not one of
            `self.labels`, or `self.type` is not a known type.
    """

    if not _match:
        raise ValueError("Could not find the answer in the text.")

    val = _match.group(1)
    if self.type in (ClassificationTypes.BINARY, ClassificationTypes.SINGLE_LABEL):
        if val not in self.labels:
            # Verbatim lookup failed: retry without stray whitespace.
            val = val.strip()
        if val not in self.labels:
            raise ValueError(f"Invalid label: {val}")
        return val

    elif self.type == ClassificationTypes.MULTI_LABEL:
        labels = [
            label if label in self.labels else label.strip()
            for label in val.split(", ")
        ]
        for label in labels:
            if label not in self.labels:
                raise ValueError(f"Invalid label: {label}")
        return labels
    else:
        raise ValueError(f"Invalid classification type: {self.type}")

`ClassificationConfig`

Bases: BaseModel

Source code in docprompt/tasks/classification/base.py

class ClassificationConfig(BaseModel):
    """Configuration for a classification task: type, labels and prompt extras."""

    type: ClassificationTypes
    labels: LabelType
    descriptions: Optional[List[str]] = Field(
        None, description="The descriptions for each label (if any)."
    )

    instructions: Optional[str] = Field(
        None,
        description="Additional instructions to pass to the LLM for the task. Required for Binary Classification.",
    )

    confidence: bool = Field(False)

    @model_validator(mode="before")
    @classmethod
    def validate_label_bindings(cls, data: Any) -> Any:
        """Validate the label/instruction bindings based on the type.

        Single-label and multi-label classification require `labels`. Binary
        classification requires `instructions`, and its labels are always
        set to ["YES", "NO"].

        Input that is not a dict, or whose `type` is missing or unrecognised,
        is returned unchanged so that regular field validation reports the
        problem.

        Raises:
            ValueError: If a required entry for the given type is missing.
        """

        if not isinstance(data, dict):
            return data

        classification_type = data.get("type", None)
        if classification_type == ClassificationTypes.SINGLE_LABEL:
            if not data.get("labels", None):
                raise ValueError(
                    "labels must be provided for single_label classification"
                )

        elif classification_type == ClassificationTypes.BINARY:
            if not data.get("instructions", None):
                raise ValueError(
                    "instructions must be provided for binary classification"
                )
            # Build a new dict so the caller's input is not mutated.
            return {**data, "labels": ["YES", "NO"]}

        elif classification_type == ClassificationTypes.MULTI_LABEL:
            if not data.get("labels", None):
                raise ValueError(
                    "labels must be provided for multi_label classification"
                )

        return data

    @model_validator(mode="after")
    def validate_descriptions_length(self):
        """Ensure `descriptions`, when given, has one entry per label."""
        if self.descriptions is not None:
            labels = self.labels
            if labels is not None and len(self.descriptions) != len(labels):
                raise ValueError("descriptions must have the same length as labels")
        return self

    @property
    def formatted_labels(self):
        """Produce the formatted labels for the prompt template.

        Yields "label: description" strings when descriptions are set,
        otherwise the raw labels.
        """
        raw_labels = self.labels
        if self.descriptions:
            for label, description in zip(raw_labels, self.descriptions):
                yield f"{label}: {description}"
        else:
            yield from raw_labels

`formatted_labels` `property`

Produce the formatted labels for the prompt template.

`validate_label_bindings(data)`

Validate the label/description bindings based on the type.

Source code in docprompt/tasks/classification/base.py

@model_validator(mode="before")
@classmethod
def validate_label_bindings(cls, data: Any) -> Any:
    """Validate the label/instruction bindings based on the type.

    Single-label and multi-label classification require `labels`. Binary
    classification requires `instructions`, and its labels are always set to
    ["YES", "NO"].

    Input that is not a dict, or whose `type` is missing or unrecognised, is
    returned unchanged so that regular field validation reports the problem.

    Raises:
        ValueError: If a required entry for the given type is missing.
    """

    if not isinstance(data, dict):
        return data

    classification_type = data.get("type", None)
    if classification_type == ClassificationTypes.SINGLE_LABEL:
        if not data.get("labels", None):
            raise ValueError(
                "labels must be provided for single_label classification"
            )

    elif classification_type == ClassificationTypes.BINARY:
        if not data.get("instructions", None):
            raise ValueError(
                "instructions must be provided for binary classification"
            )
        # Build a new dict so the caller's input is not mutated.
        return {**data, "labels": ["YES", "NO"]}

    elif classification_type == ClassificationTypes.MULTI_LABEL:
        if not data.get("labels", None):
            raise ValueError(
                "labels must be provided for multi_label classification"
            )

    return data

`ConfidenceLevel`

Bases: str, Enum

The confidence level of the classification.

Source code in docprompt/tasks/classification/base.py

class ConfidenceLevel(str, Enum):
    """Confidence attached to a classification result.

    Members are also `str` instances whose values are lower-case words.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"

`credentials`

The credentials module defines a simple model schema for storing credentials.

`APIKeyCredential`

Bases: BaseCredentials

The API key credential model.

Source code in docprompt/tasks/credentials.py

class APIKeyCredential(BaseCredentials):
    """Credentials made of a single secret API key."""

    api_key: SecretStr = Field(...)

    def __init__(self, environ_path: Optional[str] = None, **data):
        """
        Resolve the API key from `data` or, failing that, the environment.

        Args:
            environ_path: Name of the environment variable to read when no
                `api_key` is supplied (or it is None).
            **data: Only the `api_key` entry is used.
        """
        explicit_key = data.get("api_key", None)
        needs_env_lookup = explicit_key is None and bool(environ_path)

        if needs_env_lookup:
            resolved_key = os.environ.get(environ_path, None)
        else:
            resolved_key = explicit_key

        super().__init__(api_key=resolved_key)

`AWSCredentials`

Bases: BaseCredentials

The AWS credentials model.

Source code in docprompt/tasks/credentials.py

class AWSCredentials(BaseCredentials):
    """AWS credentials, taken from arguments with environment fallbacks."""

    aws_access_key_id: Optional[SecretStr] = Field(None)
    aws_secret_access_key: Optional[SecretStr] = Field(None)
    aws_session_token: Optional[SecretStr] = Field(None)
    aws_region: Optional[str] = Field(None)

    def __init__(self, **data):
        """
        Populate each field from `data`, else from its environment variable.

        The environment is only consulted for a field whose key is absent
        from `data`; an explicitly passed value (even None) is kept as given.
        """
        env_var_by_field = {
            "aws_access_key_id": "AWS_ACCESS_KEY_ID",
            "aws_secret_access_key": "AWS_SECRET_ACCESS_KEY",
            "aws_session_token": "AWS_SESSION_TOKEN",
            "aws_region": "AWS_DEFAULT_REGION",
        }
        resolved = {
            field_name: data.get(field_name, os.environ.get(env_var, None))
            for field_name, env_var in env_var_by_field.items()
        }
        super().__init__(**resolved)

    @model_validator(mode="after")
    def _validate_aws_credentials(self) -> Self:
        """
        Check that a usable, unambiguous set of credentials was supplied.

        Exactly one of a session token or an access-key/secret-key pair must
        be present, and the key pair additionally requires a region.

        NOTE(review): a key pair combined with a session token is rejected
        here -- confirm that this is the intended policy.
        """
        has_key_pair = bool(self.aws_access_key_id and self.aws_secret_access_key)
        has_session_token = bool(self.aws_session_token)

        if not (has_key_pair or has_session_token):
            raise ValueError(
                "You must provide either an AWS session token or an access key and secret key."
            )

        if has_key_pair and not self.aws_region:
            raise ValueError(
                "You must provide an AWS region when using an access key and secret key."
            )

        if has_key_pair and has_session_token:
            raise ValueError(
                "You cannot provide both an AWS session token and an access key and secret key."
            )

        return self

`BaseCredentials`

Bases: BaseModel

The base credentials model.

Source code in docprompt/tasks/credentials.py

class BaseCredentials(BaseModel):
    """Common base for the credential models."""

    @property
    def kwargs(self) -> Dict[str, str]:
        """Dump the non-None fields to a dict, unwrapping `SecretStr` values."""
        dumped = self.model_dump(exclude_none=True)
        return {
            field_name: (
                value.get_secret_value() if isinstance(value, SecretStr) else value
            )
            for field_name, value in dumped.items()
        }

`kwargs: Dict[str, str]` `property`

Return the credentials as a dictionary with secrets exposed.

`GCPServiceFileCredentials`

Bases: BaseCredentials

The GCP service account credentials model.

Source code in docprompt/tasks/credentials.py

class GCPServiceFileCredentials(BaseCredentials):
    """The GCP service account credentials model.

    Exactly one of `service_account_info` or `service_account_file` must end
    up set. The `GCP_SERVICE_ACCOUNT_FILE` environment variable is used as a
    fallback for the file path only when neither value is passed in.
    """

    service_account_info: Optional[Dict[str, str]] = Field(None)
    service_account_file: Optional[str] = Field(None)

    def __init__(self, **data):
        """
        Resolve the credential source from `data`, else from the environment.

        Args:
            **data: May contain `service_account_info` and/or
                `service_account_file`; other keys are ignored.
        """
        service_account_info = data.get("service_account_info", None)

        if "service_account_file" in data:
            service_account_file = data["service_account_file"]
        elif service_account_info is None:
            service_account_file = os.environ.get("GCP_SERVICE_ACCOUNT_FILE", None)
        else:
            # Explicit account info was passed: skip the environment fallback
            # so it cannot trip the "not both" check below.
            service_account_file = None

        super().__init__(
            service_account_info=service_account_info,
            service_account_file=service_account_file,
        )

    @model_validator(mode="after")
    def _validate_gcp_credentials(self) -> Self:
        """Ensure the provided GCP credentials are valid.

        Raises:
            ValueError: If neither or both of `service_account_info` and
                `service_account_file` are set.
        """
        if self.service_account_info is None and self.service_account_file is None:
            raise ValueError(
                "You must provide either service_account_info or service_account_file. You may set the `GCP_SERVICE_ACCOUNT_FILE` environment variable to the path of the service account file."
            )
        if (
            self.service_account_info is not None
            and self.service_account_file is not None
        ):
            raise ValueError(
                "You must provide either service_account_info or service_account_file, not both"
            )
        return self

`factory`

Define the base factory for creating task providers.