E-mail(PDF) LLM Parsing

Kyung Pyo Ham·2024년 7월 23일
0

LLM Agent Project

목록 보기
3/3
# Prompt template that instructs the LLM to extract structured fields from the
# raw text of an e-mail. `{email_text}` is the only template variable and is
# filled in when the chain is invoked.
# The model is asked to reply with a bare Python dictionary (no extra text), and
# to format dates as "YYYY-MM-DD HH:MM".
# NOTE(review): "Key 1" .. "Key 4" look like placeholders for the real field
# names to extract -- replace them with the actual keys before use.
prompt = PromptTemplate.from_template("""
Parse the following email text and extract the information in a structured format.
Return the result as a Python dictionary, without any additional text or formatting

Email text:
{email_text}

Extract the following information:
- Key 1
- Key 2
- Key 3
- Key 4

Format the dates as YYYY-MM-DD HH:MM.
""")


# Assemble the parsing pipeline: the value passed to `invoke` is forwarded
# unchanged as `email_text`, rendered into the prompt, sent to the LLM, and the
# model's reply is reduced to a plain string.
_chain_input = {"email_text": RunnablePassthrough()}
parse_chain = _chain_input | prompt | llm | StrOutputParser()

def parse_llm_output(output: str) -> Dict:
    """Convert the raw text returned by the LLM into a dictionary.

    The reply is parsed with progressively more lenient strategies:

    1. ``ast.literal_eval`` (Python dict literal),
    2. ``json.loads`` (JSON object),
    3. a naive ``key: value`` split on commas as a last resort.

    Strategies 1 and 2 are tried first on the whole (stripped) reply and then
    on the outermost ``{...}`` span, so replies wrapped in a Markdown code
    fence, prefixed by an assignment such as ``booking_info = {...}``, or
    surrounded by explanatory prose are still recognised.

    Args:
        output: Raw string produced by the LLM.

    Returns:
        The parsed dictionary. A reply that parses to something other than a
        dict (list, string, number, ...) is not returned as-is; it falls
        through to the later strategies. If every strategy fails, the error
        and the offending text are printed and an empty dict is returned.
    """
    text = output.strip()

    # Candidates in priority order: the full reply, then just the outermost
    # brace-delimited span (drops code fences, "name =" prefixes and prose).
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        braced = text[start:end + 1]
        if braced != text:
            candidates.append(braced)

    for candidate in candidates:
        # First, try to parse as a Python literal. literal_eval can also raise
        # TypeError (e.g. unhashable dict key) and, on pathological input,
        # MemoryError / RecursionError.
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, dict):
            return parsed

        # If that fails, try to parse as JSON (handles true/false/null).
        try:
            parsed = json.loads(candidate)
        except (json.JSONDecodeError, RecursionError):
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    # If both methods fail, try to manually parse the string.
    # NOTE: this split is naive -- a value containing a comma breaks it, in
    # which case the whole fallback fails and {} is returned.
    try:
        # Remove curly braces and split by commas
        body = candidates[-1].strip("{}")
        result = {}
        for item in body.split(","):
            # Split on the first colon only so "HH:MM" values stay intact.
            key, value = item.split(":", 1)
            # Remove whitespace and either style of surrounding quotes
            result[key.strip().strip("\"'")] = value.strip().strip("\"'")
        return result
    except ValueError as e:
        print(f"Error parsing LLM output: {e}")
        print(f"Problematic output: {output}")
        return {}
        
def process_pdf_and_parse_email(pdf_path: str) -> dict:
    """Extract the text of a PDF e-mail and parse it into a dictionary.

    The PDF is read with ``extract_text_from_pdf``, the text of its first
    element is run through ``parse_chain``, and the LLM reply is converted
    with ``parse_llm_output``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The dictionary produced by ``parse_llm_output``, or an empty dict when
        no text could be extracted from the PDF.
    """
    # Presumably a sequence of page/document objects exposing `.page_content`
    # (it is indexed and that attribute is read below) -- confirm against
    # extract_text_from_pdf.
    pages = extract_text_from_pdf(pdf_path)

    # Guard: indexing [0] on an empty result would raise IndexError; return
    # the same "nothing parsed" value the parser uses instead.
    if not pages:
        return {}

    # NOTE(review): only the first element is sent to the LLM; any content on
    # later pages is ignored.
    parsed_output = parse_chain.invoke(pages[0].page_content)
    return parse_llm_output(parsed_output)
    
    
# Run the pipeline once. `data` is not defined in the visible snippet --
# presumably it holds the path of the PDF to parse (the function expects
# `pdf_path: str`). The returned dictionary is not assigned to anything here.
process_pdf_and_parse_email(data)
profile
Data Scientist | AI Engineer

0개의 댓글