Generate fake email pairs for LLM finetuning¶
This notebook generates synthetic pairs of request/response emails, intended for LLM fine-tuning use cases.
The generated emails are written to the local folder ./fake_emails, with senders, receivers, fields, and companies shuffled from predefined lists.
In [ ]:
%pip install langchain
In [ ]:
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.chat_models import ChatOllama
from langchain.chains import LLMChain
import asyncio
import json
import os
import random
import time

# Folder that will hold the generated email pairs
promptDir = "./fake_emails"
if not os.path.exists(promptDir):
    os.makedirs(promptDir)
# Target schema for one request/response email pair
class Email(BaseModel):
    sender: str = Field(description="name of an email sender")
    sender_email_content: str = Field(description="content of the senders email")
    answer_email_content: str = Field(
        description="content of the answer email in a very polite tone, much longer than the sender-email, refers back to the sender."
    )


# Collects the parsed responses across all runs
emails = []
async def async_generate(i, chain, sender, receiver, field, sender_query, company):
    # Invoke the chain asynchronously and persist the parsed Email as JSON
    resp = await chain.ainvoke(
        {
            "receiver": receiver,
            "sender": sender,
            "field": field,
            "query": sender_query,
            "company": company,
        }
    )
    emails.append(resp)
    with open(f"{promptDir}/email_with_output_parser{i + 1}.json", "w") as f:
        f.write(json.dumps(resp["text"].__dict__))
async def generate_concurrently(i):
    sender_query = "Generate me email inquiry/answer pair"
    model = ChatOllama(temperature=0.9, model="mistral")
    parser = PydanticOutputParser(pydantic_object=Email)
    prompt = PromptTemplate(
        template="Generate me email from '{sender}' which is reaching out to '{receiver}' with a short introduction email asking for collaboration in '{field}'. It starts always with something similar to: 'Dear {receiver}' and always ends with something like: 'Best regards\n{sender}\n---\n{company}'. The answer email is much longer and more polite. \n\n{format_instructions}\n{query}\n",
        input_variables=["receiver", "sender", "query", "field", "company"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    # Shuffle the predefined lists and pick the first entry of each
    senders = [
        "Anita",
        "Heinz",
        "Elisa",
        "Alex",
    ]
    random.shuffle(senders)
    sender = senders[0]

    receivers = [
        "Tommy",
        "Emile",
        "TBD",
    ]
    random.shuffle(receivers)
    receiver = receivers[0]

    fields = [
        "data-science",
        "machine-learning",
        "process-automation",
        "software-engineering",
    ]
    random.shuffle(fields)
    field = fields[0]

    companies = [
        "ACME Inc.",
        "Nihongo Limited",
        "A better film-rental-company",
        "Old-school company",
    ]
    random.shuffle(companies)
    company = companies[0]

    print(
        f"#{i+1} From:",
        sender,
        ", Receiver:",
        receiver,
        ", Field:",
        field,
        ", Company:",
        company,
    )

    chain = LLMChain(llm=model, prompt=prompt, output_parser=parser)
    # Run five generations concurrently; each task gets its own file index
    # so the concurrent writes do not overwrite each other
    tasks = [
        async_generate(i * 5 + j, chain, sender, receiver, field, sender_query, company)
        for j in range(5)
    ]
    await asyncio.gather(*tasks)
s = time.perf_counter()
index = 0
while True:
    try:
        await generate_concurrently(index)
        if index > 5000:
            break
        index += 1
    except Exception as e:
        print(e)
        print('\n')
        # raise e
        pass
elapsed = time.perf_counter() - s
print("\033[1m" + f"Concurrent executed in {elapsed:0.2f} seconds." + "\033[0m")
Results¶
This should create emails like:
{
"sender": "Elisa",
"sender_email_content": "Dear Tommy, I hope this email finds you well. My name is Elisa, and I am reaching out to you in hopes of collaborating on a software engineering project. I believe that our combined skills and expertise could lead to a successful partnership. I look forward to hearing from you soon. Best regards, Elisa - A better film-rental-company",
"answer_email_content": "Dear Elisa, Thank you for reaching out to me. I am excited about the opportunity to collaborate with you on a software engineering project. Your passion and dedication to your work are truly inspiring, and I am confident that together we can achieve great things. I am looking forward to discussing the details of our collaboration further. Best regards, Tommy"
}
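As a quick sanity check, the generated pairs can be loaded back from ./fake_emails. A minimal sketch, assuming the email_with_output_parser*.json naming used above:

import glob
import json

# Load every generated pair back from the output folder (sketch)
pairs = []
for path in sorted(glob.glob("./fake_emails/email_with_output_parser*.json")):
    with open(path) as f:
        pairs.append(json.load(f))

print(f"Loaded {len(pairs)} pairs")
print(pairs[0]["sender_email_content"][:80])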
Further steps¶
For fine-tuning datasets, this array of JSON objects can now be converted into an LLM prompt format such as:
mistral/llama2:¶
template = f"<s>[INST]{sender_email_content}[/INST] {answer_email_content}</s>"
llama3:¶
template = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|> you are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>{sender_email_content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{answer_email_content}<|eot_id|>"