Extract dates from a string using a set of predefined regex patterns.
Returns a list of tuples; in each tuple, the first element is the parsed date object and the second is the full date string as matched in the input.
Source code in docprompt/utils/date_extraction.py
def extract_dates_from_text(
    input_string: str, *, date_formats: DateFormatsType = default_date_formats
) -> List[Tuple[date, str]]:
    """
    Extract dates from a string using a set of predefined regex patterns.

    Each entry of ``date_formats`` is a ``(regex, date_format)`` pair. The
    regex (an object exposing ``findall``, e.g. a compiled pattern) locates
    candidate dates, and ``date_format`` is the ``datetime.strptime`` format
    string used to parse each candidate.

    Before parsing, a candidate is normalized:

    * when the format contains ``%d``, English ordinal suffixes attached to a
      number ("1st", "22nd", "3rd", "4th") are removed;
    * runs of whitespace are collapsed to a single space;
    * whitespace directly before a comma is removed.

    Returns a list of tuples, where the first element is the date object and
    the second is the full date string as it was matched (before
    normalization). Candidates that fail to parse are reported with ``print``
    and skipped.
    """
    extracted_dates = []

    for regex, date_format in date_formats:
        for match_obj in regex.findall(input_string):
            # ``findall`` yields a tuple of groups when the pattern has several
            # capture groups (the first group captures the entire date), but a
            # plain string when it has zero or one group. Indexing that string
            # would keep only its first character.
            full_date = match_obj if isinstance(match_obj, str) else match_obj[0]

            if "%d" in date_format:
                # Only strip a suffix that directly follows a digit; an
                # unanchored pattern would also eat the "st" in "August".
                parse_date = re.sub(
                    r"(?<=\d)(?:st|nd|rd|th)\b",
                    "",
                    full_date,
                    flags=re.IGNORECASE,
                )
            else:
                parse_date = full_date

            parse_date = re.sub(r"\s+", " ", parse_date).strip()
            # Commas shouldn't have spaces before them
            parse_date = re.sub(r"\s+,", ",", parse_date).strip()

            # Convert to datetime object
            try:
                date_obj = datetime.strptime(parse_date, date_format)
            except ValueError as e:
                print(f"Error parsing date '{full_date}': {e}")
                continue

            extracted_dates.append((date_obj.date(), full_date))

    return extracted_dates
|