Convert multiple resumes from a file (Excel, Word, or plain text) or a long paragraph into a list of individual resume texts
For Excel File
If resumes are stored in an Excel file where each row contains a resume (or part of one), you can read them with pandas:
import pandas as pd

def extract_resumes_from_excel(file_path, column_name):
    # Load the Excel file
    df = pd.read_excel(file_path)
    # Extract the resumes from the specified column
    resume_texts = df[column_name].dropna().tolist()
    return resume_texts

# Example usage
file_path = "resumes.xlsx"    # Replace with your file path
column_name = "ResumeText"    # Replace with the column containing resumes
resume_texts = extract_resumes_from_excel(file_path, column_name)
parsed_resumes = process_resumes(resume_texts)
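For reference, here is a minimal sketch of the kind of Excel file this expects, with one resume per row in a ResumeText column (the file name, column name, and contents are purely illustrative):

import pandas as pd

# Hypothetical sample data: one complete resume per row in a "ResumeText" column
sample = pd.DataFrame({
    "ResumeText": [
        "John Doe\nEmail: john.doe@example.com\nSkills: Python, Machine Learning",
        "Jane Smith\nEmail: jane.smith@sample.com\nSkills: Java, NLP, SQL",
    ]
})
sample.to_excel("resumes.xlsx", index=False)  # writing .xlsx requires openpyxl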
For Plain Text File
If resumes are stored in a plain text file separated by a specific delimiter (e.g., ---), you can split them:
Code:
def extract_resumes_from_txt(file_path, delimiter="---"):
    with open(file_path, "r") as file:
        text = file.read()
    # Split the text into individual resumes
    resume_texts = text.split(delimiter)
    return [resume.strip() for resume in resume_texts if resume.strip()]

# Example usage
file_path = "resumes.txt"  # Replace with your file path
resume_texts = extract_resumes_from_txt(file_path)
parsed_resumes = process_resumes(resume_texts)
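The same function works if the resumes are separated by blank lines instead of ---; just pass a different delimiter:

# Resumes separated by blank lines rather than "---"
resume_texts = extract_resumes_from_txt("resumes.txt", delimiter="\n\n")
parsed_resumes = process_resumes(resume_texts)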
For Word Document (DOCX)
If resumes are stored in a Word file, use the python-docx library to extract text:
Code:
from docx import Document

def extract_resumes_from_docx(file_path, delimiter="---"):
    # Load the Word document
    doc = Document(file_path)
    # Extract text from all paragraphs
    text = "\n".join([p.text for p in doc.paragraphs])
    # Split resumes using delimiter
    resume_texts = text.split(delimiter)
    return [resume.strip() for resume in resume_texts if resume.strip()]

# Example usage
file_path = "resumes.docx"  # Replace with your file path
resume_texts = extract_resumes_from_docx(file_path)
parsed_resumes = process_resumes(resume_texts)
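If each resume lives in its own .docx file rather than one combined document, a small variation collects them from a folder (the folder name and helper name here are hypothetical):

import glob
from docx import Document

def extract_resumes_from_docx_folder(folder_path):
    # Treat every .docx file in the folder as one resume
    resume_texts = []
    for path in glob.glob(f"{folder_path}/*.docx"):
        doc = Document(path)
        resume_texts.append("\n".join(p.text for p in doc.paragraphs))
    return resume_texts

# Example usage (assumes a folder named "resumes/")
resume_texts = extract_resumes_from_docx_folder("resumes")
parsed_resumes = process_resumes(resume_texts)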
For PDF File
If resumes are stored in a PDF file, use the PyPDF2 or PyMuPDF libraries to extract text:
import PyPDF2

def extract_resumes_from_pdf(file_path, delimiter="---"):
    with open(file_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no extractable text
            text += page.extract_text() or ""
    # Split resumes using delimiter
    resume_texts = text.split(delimiter)
    return [resume.strip() for resume in resume_texts if resume.strip()]

# Example usage
file_path = "resumes.pdf"  # Replace with your file path
resume_texts = extract_resumes_from_pdf(file_path)
parsed_resumes = process_resumes(resume_texts)
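Since PyMuPDF was mentioned as an alternative, here is a minimal sketch of the same extraction using PyMuPDF (imported as fitz); the delimiter logic is identical, only the text extraction changes:

import fitz  # PyMuPDF

def extract_resumes_from_pdf_pymupdf(file_path, delimiter="---"):
    # Extract text from every page of the PDF
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)
    # Split resumes using delimiter
    resume_texts = text.split(delimiter)
    return [resume.strip() for resume in resume_texts if resume.strip()]

# Example usage
resume_texts = extract_resumes_from_pdf_pymupdf("resumes.pdf")
parsed_resumes = process_resumes(resume_texts)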
For Paragraph or Long String
If resumes are stored as a single string with clear separations (e.g., \n\n or ---):
Code:
def extract_resumes_from_paragraph(text, delimiter="---"):
    # Split the string into individual resumes
    resume_texts = text.split(delimiter)
    return [resume.strip() for resume in resume_texts if resume.strip()]
# Example input
long_string = """
John Doe
Email: john.doe@example.com
Phone: 9876543210
Skills: Python, Machine Learning
---
Jane Smith
Email: jane.smith@sample.com
Phone: 1234567890
Skills: Java, NLP, SQL
"""
resume_texts = extract_resumes_from_paragraph(long_string)
parsed_resumes = process_resumes(resume_texts)
Final Steps:
Use the relevant function based on the file type (Excel, TXT, DOCX, PDF) or input format.
Convert the extracted text into a list (resume_texts).
Process the list using your existing process_resumes function.
This makes your resume parsing pipeline flexible and adaptable to various input formats.
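To tie these pieces together, here is a small dispatcher sketch that picks the right extractor from the file extension (the helper name detect_and_extract is made up for illustration; process_resumes is assumed to be your existing parsing function):

import os

def detect_and_extract(file_path, column_name="ResumeText", delimiter="---"):
    # Route to the right extractor based on the file extension
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".xlsx":
        return extract_resumes_from_excel(file_path, column_name)
    elif ext == ".txt":
        return extract_resumes_from_txt(file_path, delimiter)
    elif ext == ".docx":
        return extract_resumes_from_docx(file_path, delimiter)
    elif ext == ".pdf":
        return extract_resumes_from_pdf(file_path, delimiter)
    raise ValueError(f"Unsupported file type: {ext}")

# Example usage
resume_texts = detect_and_extract("resumes.pdf")
parsed_resumes = process_resumes(resume_texts)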
Convert multiple resumes from a file (Excel, Word, or plain text) or a long paragraph into a list of individual resume texts using LangChain
Install Necessary Libraries
Before proceeding, ensure that you have LangChain and other required libraries installed:
pip install langchain pandas python-docx PyPDF2 openpyxl
Depending on your LangChain version, the loaders used below may also require the pypdf and unstructured packages (pip install pypdf unstructured).
Using LangChain Document Loaders
LangChain offers built-in loaders for various file types such as text files, PDFs, Word documents, and more. You can use these loaders to read and split your resumes.
Example for Processing Multiple Resumes:
Code for Supported Formats:
from langchain.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_resumes(file_path, file_type):
    if file_type == "txt":
        # Load text file
        loader = TextLoader(file_path)
    elif file_type == "pdf":
        # Load PDF file
        loader = PyPDFLoader(file_path)
    elif file_type == "docx":
        # Load Word document
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError("Unsupported file type. Use 'txt', 'pdf', or 'docx'.")
    # Load documents into LangChain
    documents = loader.load()
    return documents

def split_resumes(documents, chunk_size=1000, chunk_overlap=100):
    # Initialize LangChain text splitter
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = splitter.split_documents(documents)
    return split_docs

# Example usage:
file_path = "resumes.docx"  # Replace with your file path
file_type = "docx"          # Supported types: 'txt', 'pdf', 'docx'

# Load resumes
documents = load_resumes(file_path, file_type)

# Split resumes into manageable chunks
split_documents = split_resumes(documents)

# Access the split text chunks
resume_texts = [doc.page_content for doc in split_documents]

# Process the split resumes
parsed_resumes = process_resumes(resume_texts)
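Note that RecursiveCharacterTextSplitter splits purely by character count, so a single resume can end up spread across several chunks. If the file separates resumes with a known delimiter such as ---, a sketch like the following (the helper name split_resumes_by_delimiter is hypothetical) keeps each resume as its own Document instead:

from langchain.schema import Document

def split_resumes_by_delimiter(documents, delimiter="---"):
    # Split each loaded document on the delimiter so every resume becomes its own Document
    resume_docs = []
    for doc in documents:
        for part in doc.page_content.split(delimiter):
            if part.strip():
                resume_docs.append(Document(page_content=part.strip()))
    return resume_docs

# Example usage
documents = load_resumes("resumes.pdf", "pdf")
resume_docs = split_resumes_by_delimiter(documents)
resume_texts = [doc.page_content for doc in resume_docs]
parsed_resumes = process_resumes(resume_texts)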
Using LangChain to Process Excel Files
For Excel files, a simple approach is to load the data with pandas and wrap each resume in a LangChain Document yourself (see the DataFrameLoader sketch at the end of this section for an alternative).
Code for Excel Files:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def load_resumes_from_excel(file_path, column_name):
    # Load the Excel file using pandas
    df = pd.read_excel(file_path)
    # Extract resumes from the specified column
    resumes = df[column_name].dropna().tolist()
    # Convert each resume into a LangChain Document object
    documents = [Document(page_content=resume) for resume in resumes]
    return documents

# Example usage:
file_path = "resumes.xlsx"    # Replace with your file path
column_name = "ResumeText"    # Replace with the column containing resumes

# Load resumes
documents = load_resumes_from_excel(file_path, column_name)

# Split resumes
split_documents = split_resumes(documents)

# Convert split documents to text
resume_texts = [doc.page_content for doc in split_documents]

# Process the resumes
parsed_resumes = process_resumes(resume_texts)
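If your LangChain version includes DataFrameLoader, it can wrap the DataFrame rows into Documents for you; a brief sketch, assuming the same ResumeText column:

import pandas as pd
from langchain.document_loaders import DataFrameLoader

# Each row of the "ResumeText" column becomes one Document
df = pd.read_excel("resumes.xlsx")
loader = DataFrameLoader(df, page_content_column="ResumeText")
documents = loader.load()
resume_texts = [doc.page_content for doc in documents]
parsed_resumes = process_resumes(resume_texts)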
Using LangChain for Paragraph Splitting
If resumes are stored as a single string (e.g., a long paragraph or text file), you can use LangChain's text splitter directly.
Code for Paragraph Splitting:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_resumes_from_paragraph(text, delimiter="---"):
    # Split the input text into individual resumes
    resumes = text.split(delimiter)
    # Convert each resume into a LangChain Document object
    documents = [Document(page_content=resume.strip()) for resume in resumes if resume.strip()]
    return documents
# Example input
long_string = """
John Doe
Email: john.doe@example.com
Phone: 9876543210
Skills: Python, Machine Learning
---
Jane Smith
Email: jane.smith@sample.com
Phone: 1234567890
Skills: Java, NLP, SQL
"""
# Load resumes from paragraph
documents = load_resumes_from_paragraph(long_string)
# Split resumes
split_documents = split_resumes(documents)
# Convert split documents to text
resume_texts = [doc.page_content for doc in split_documents]
# Process the resumes
parsed_resumes = process_resumes(resume_texts)