Applications
Text Similarity: Measures how similar two texts are semantically using cosine similarity on embeddings (see the numpy sketch after this list).
Plagiarism Detection: Compares a student's text with source materials to detect potential plagiarism.
Semantic Search: Finds the most relevant documents in a collection based on a query.
Question-Answer Relevance: Evaluates whether an answer is relevant to a given question.
Content Recommendation: Suggests content (e.g., articles, products) based on user input.
Duplicate Detection: Identifies duplicate or near-duplicate entries in a dataset.
Document Clustering: Groups documents with similar content by calculating pairwise cosine similarity.
Chatbot Response Ranking: Ranks chatbot responses by relevance to the user's query.
Resume and Job Description Matching: Measures the similarity between a resume and a job description.
Brand Sentiment Matching: Evaluates how well a customer's review aligns with a brand's tone or messaging.
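Every one of these applications reduces to the same measure: the cosine of the angle between two vectors, i.e. their dot product divided by the product of their norms. A minimal numpy sketch (the cosine_sim helper name is just for illustration):
import numpy as np

def cosine_sim(a, b):
    # cos(theta) = (a . b) / (||a|| * ||b||)
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_sim([1, 2, 3], [2, 4, 6]))  # 1.0: same direction
print(cosine_sim([1, 0], [0, 1]))        # 0.0: orthogonal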
Real-Time NLP Applications Using Cosine Similarity
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Load a transformer pipeline for embeddings
embedder = pipeline("feature-extraction", model="sentence-transformers/all-mpnet-base-v2")
# Helper function to compute cosine similarity
def compute_cosine_similarity(text1, text2):
    embedding1 = np.mean(embedder(text1), axis=1)  # Average over tokens
    embedding2 = np.mean(embedder(text2), axis=1)
    return cosine_similarity(embedding1, embedding2)[0][0]

# 1. Text Similarity
def text_similarity(text1, text2):
    similarity = compute_cosine_similarity(text1, text2)
    return {"text1": text1, "text2": text2, "similarity": similarity}

# 2. Plagiarism Detection
def plagiarism_detection(student_text, source_texts):
    results = []
    for source in source_texts:
        similarity = compute_cosine_similarity(student_text, source)
        results.append({"source": source, "similarity": similarity})
    return sorted(results, key=lambda x: x["similarity"], reverse=True)

# 3. Semantic Search
def semantic_search(query, documents):
    results = []
    for doc in documents:
        similarity = compute_cosine_similarity(query, doc)
        results.append({"document": doc, "similarity": similarity})
    return sorted(results, key=lambda x: x["similarity"], reverse=True)

# 4. Question-Answer Relevance
def question_answer_relevance(question, answer):
    similarity = compute_cosine_similarity(question, answer)
    return {"question": question, "answer": answer, "relevance": similarity}

# 5. Content Recommendation
def content_recommendation(user_input, content_database):
    recommendations = []
    for content in content_database:
        similarity = compute_cosine_similarity(user_input, content)
        recommendations.append({"content": content, "similarity": similarity})
    return sorted(recommendations, key=lambda x: x["similarity"], reverse=True)

# 6. Duplicate Detection
def duplicate_detection(text, dataset):
    duplicates = []
    for data in dataset:
        similarity = compute_cosine_similarity(text, data)
        if similarity > 0.9:  # Adjust threshold as needed
            duplicates.append(data)
    return duplicates

# 7. Document Clustering
def document_clustering(documents):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    # Pairwise similarity matrix; a clustering algorithm (e.g. AgglomerativeClustering
    # over 1 - similarity as a precomputed distance) can turn this into groups
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return cosine_sim

# 8. Chatbot Response Ranking
def chatbot_response_ranking(user_query, responses):
    ranked_responses = []
    for response in responses:
        similarity = compute_cosine_similarity(user_query, response)
        ranked_responses.append({"response": response, "similarity": similarity})
    return sorted(ranked_responses, key=lambda x: x["similarity"], reverse=True)

# 9. Resume and Job Description Matching
def resume_job_matching(resume, job_description):
    similarity = compute_cosine_similarity(resume, job_description)
    return {"resume": resume, "job_description": job_description, "similarity": similarity}

# 10. Brand Sentiment Matching
def brand_sentiment_matching(customer_review, brand_tone):
    similarity = compute_cosine_similarity(customer_review, brand_tone)
    return {"review": customer_review, "brand_tone": brand_tone, "alignment": similarity}

# Example usage
if __name__ == "__main__":
    text1 = "Artificial intelligence is transforming industries."
    text2 = "Machine learning is a subset of AI that impacts various sectors."
    documents = [
        "AI is evolving quickly.",
        "Healthcare uses machine learning to improve outcomes.",
        "Finance relies on data science for better predictions.",
    ]
    student_text = "AI is the future of technology."
    source_texts = [
        "Artificial intelligence is the technology of the future.",
        "AI will transform the world.",
    ]
    user_query = "What are the benefits of AI?"
    chatbot_responses = [
        "AI helps automate tasks.",
        "It can analyze data efficiently.",
        "AI provides personalized recommendations.",
    ]
    print("1. Text Similarity:", text_similarity(text1, text2))
    print("2. Plagiarism Detection:", plagiarism_detection(student_text, source_texts))
    print("3. Semantic Search:", semantic_search("AI and finance", documents))
    print("4. Question-Answer Relevance:", question_answer_relevance("What is AI?", "AI is artificial intelligence."))
    print("5. Content Recommendation:", content_recommendation("AI in healthcare", documents))
    print("6. Duplicate Detection:", duplicate_detection("AI is the future of technology.", source_texts))
    print("7. Document Clustering:", document_clustering(documents))
    print("8. Chatbot Response Ranking:", chatbot_response_ranking(user_query, chatbot_responses))
    print("9. Resume and Job Description Matching:", resume_job_matching("I am skilled in AI.", "Looking for AI expertise."))
print("10. Brand Sentiment Matching:", brand_sentiment_matching("I love how innovative th")
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.language import Language
from spacy.tokens import Doc

# Load a SpaCy model with static word vectors (en_core_web_sm ships without them,
# so en_core_web_md or en_core_web_lg gives meaningful token similarities)
nlp = spacy.load("en_core_web_md")

# Pre-defined profane words for similarity (seed words)
profane_seeds = ["badword", "offensive", "vulgar"]

# Generate embeddings for the seed words
seed_embeddings = [nlp(word).vector for word in profane_seeds]

# Register the custom extension before the component runs
Doc.set_extension("censored_text", default=None)

@Language.component("profanity_filter")
def profanity_filter(doc):
    tokens = []
    for token in doc:
        # Skip tokens without a vector (out-of-vocabulary words)
        if not token.has_vector:
            tokens.append(token.text)
            continue
        # Compute cosine similarity with the seed words
        similarities = cosine_similarity([token.vector], seed_embeddings)
        max_similarity = max(similarities[0])  # Get the highest similarity score
        # Replace the token if similarity exceeds a threshold
        if max_similarity > 0.8:  # Adjust the threshold as needed
            tokens.append("***")
        else:
            tokens.append(token.text)
    # Store the censored text in a custom extension
    doc._.censored_text = " ".join(tokens)
    return doc

# Add the profanity filter to the pipeline (spaCy v3 adds components by registered name)
nlp.add_pipe("profanity_filter", last=True)

# Test the pipeline
text = "This is a really vulgar word and offensive language."
doc = nlp(text)
print(f"Censored Text: {doc._.censored_text}")