# Prompt sent to the LLM. The email text is substituted for the
# `{email_text}` placeholder, and the model is asked to reply with nothing but
# a Python dictionary holding the listed keys, with dates formatted as
# YYYY-MM-DD HH:MM.
# NOTE(review): "Key 1".."Key 4" look like placeholder field names — confirm
# the real keys expected by downstream consumers.
prompt = PromptTemplate.from_template("""
Parse the following email text and extract the information in a structured format.
Return the result as a Python dictionary, without any additional text or formatting
Email text:
{email_text}
Extract the following information:
- Key 1
- Key 2
- Key 3
- Key 4
Format the dates as YYYY-MM-DD HH:MM.
""")
# Parsing pipeline: the value passed to `parse_chain.invoke(...)` is supplied
# as the `email_text` prompt variable, the rendered prompt is sent to `llm`
# (defined outside this view), and the model reply is converted to a string
# by `StrOutputParser`.
parse_chain = (
{"email_text": RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if len(text) < 6 or not (text.startswith("```") and text.endswith("```")):
        return text
    inner = text[3:-3]
    # The opening fence may carry a language tag ("```python", "```json").
    # Drop that first line only when it is blank or a bare tag, never data.
    tag, newline, rest = inner.partition("\n")
    if newline and (not tag.strip() or tag.strip().isalnum()):
        inner = rest
    return inner.strip()


def _parse_key_value_pairs(text: str) -> Dict:
    """Best-effort parse of loose ``key: value, key: value`` text into a dict."""
    quotes = "\"'"
    pairs = {}
    current_key = None
    for fragment in text.strip().strip("{}").split(","):
        key, colon, value = fragment.partition(":")
        if colon:
            current_key = key.strip().strip(quotes)
            pairs[current_key] = value.strip().strip(quotes)
        elif current_key is not None and fragment.strip():
            # No colon: this is the tail of a value that itself contained a
            # comma, so glue it back onto the value it was split from.
            merged = f"{pairs[current_key]},{fragment}"
            pairs[current_key] = merged.strip().strip(quotes)
    return pairs


def parse_llm_output(output: str) -> Dict:
    """Convert the raw text reply of the LLM into a dictionary.

    The reply is first cleaned: surrounding whitespace, the literal marker
    ``ID_of_data`` and an optional Markdown code fence are removed. The
    cleaned text is then tried, in order, as:

    1. a Python literal (``ast.literal_eval``),
    2. a JSON document (``json.loads``),
    3. loose ``key: value`` pairs separated by commas.

    A stage is accepted only when it yields a ``dict``; any other value
    (list, number, string, ...) falls through to the next stage.

    Args:
        output: Text returned by the model.

    Returns:
        The parsed dictionary, or an empty dict when nothing could be
        extracted (a diagnostic is printed in that case).
    """
    cleaned = _strip_code_fence(
        output.strip().replace("ID_of_data", "").strip()
    )

    # literal_eval never executes code, but malformed input can raise more
    # than ValueError/SyntaxError (e.g. TypeError for an unhashable dict key).
    try:
        literal = ast.literal_eval(cleaned)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        literal = None
    if isinstance(literal, dict):
        return literal

    # JSON covers replies using true/false/null, which are not Python literals.
    try:
        decoded = json.loads(cleaned)
    except (json.JSONDecodeError, RecursionError):
        decoded = None
    if isinstance(decoded, dict):
        return decoded

    pairs = _parse_key_value_pairs(cleaned)
    if not pairs:
        print("Error parsing LLM output: no key/value pairs could be extracted")
        print(f"Problematic output: {output}")
    return pairs
def process_pdf_and_parse_email(pdf_path: str) -> dict:
    """Extract an email from a PDF and return its fields as a dictionary.

    The PDF is read with ``extract_text_from_pdf``, the ``page_content`` of
    the first returned element is sent through ``parse_chain``, and the
    model's reply is converted with ``parse_llm_output``.

    Args:
        pdf_path: Value handed to ``extract_text_from_pdf``.

    Returns:
        Whatever ``parse_llm_output`` produces for the model's reply.
    """
    documents = extract_text_from_pdf(pdf_path)
    # NOTE(review): only element 0 is parsed; presumably there is one element
    # per page — confirm whether later elements should be included.
    first_text = documents[0].page_content
    llm_reply = parse_chain.invoke(first_text)
    structured = parse_llm_output(llm_reply)
    return structured
# Run the pipeline on `data` (defined outside this view); the returned dict is
# not bound to a name here.
process_pdf_and_parse_email(data)