Skip to content

date_extraction

extract_dates_from_text(input_string, *, date_formats=default_date_formats)

Extract dates from a string using a set of predefined regex patterns.

Returns a list of tuples; in each tuple, the first element is the parsed date object and the second is the full matched date string.

Source code in docprompt/utils/date_extraction.py
def extract_dates_from_text(
    input_string: str, *, date_formats: DateFormatsType = default_date_formats
) -> List[Tuple[date, str]]:
    """
    Extract dates from a string using a set of predefined regex patterns.

    Args:
        input_string: The text to scan for dates.
        date_formats: Pairs of ``(compiled_regex, strptime_format)``. Each
            regex is run over ``input_string`` with ``findall`` and every
            match is parsed with the paired ``strptime`` format. When the
            regex has capture groups, the first group must capture the
            entire date.

    Returns:
        A list of ``(date, matched_text)`` tuples, where ``date`` is the
        parsed date object and ``matched_text`` is the full date string
        exactly as it was matched in the input. Results are ordered by
        format first, then by position of the match for that format.
        Matches that fail to parse are reported on stdout and skipped.
    """
    extracted_dates = []

    for regex, date_format in date_formats:
        for match_obj in regex.findall(input_string):
            # ``findall`` yields tuples only when the pattern has more than
            # one capture group; with zero or one group it yields plain
            # strings, and indexing those would return a single character.
            full_date = match_obj[0] if isinstance(match_obj, tuple) else match_obj

            if "%d" in date_format:
                # Drop ordinal suffixes ("1st", "22nd", "3rd", "4th"), but only
                # when they directly follow a digit, so that names such as
                # "August", "Monday" or "Wednesday" are left intact.
                parse_date = re.sub(r"(?<=\d)(?:st|nd|rd|th)\b", "", full_date)
            else:
                parse_date = full_date

            # Collapse runs of whitespace so the text lines up with the format
            parse_date = re.sub(r"\s+", " ", parse_date).strip()
            parse_date = re.sub(
                r"\s{1,},", ",", parse_date
            ).strip()  # Commas shouldn't have spaces before them

            # Convert to datetime object
            try:
                date_obj = datetime.strptime(parse_date, date_format)
            except ValueError as e:
                # A regex hit that is not a valid date (e.g. "February 30")
                # is reported and skipped rather than aborting the whole scan.
                print(f"Error parsing date '{full_date}': {e}")
                continue

            extracted_dates.append((date_obj.date(), full_date))

    return extracted_dates