Debug School

rakesh kumar
rakesh kumar

Posted on

Real-Time Applications of NLP Using Cosine Similarity

Applications
Text Similarity:

Measures how similar two texts are semantically using cosine similarity on embeddings.
Plagiarism Detection:

Compares a student's text with source materials to detect potential plagiarism.
Semantic Search:

Finds the most relevant documents in a collection based on a query.
Question-Answer Relevance:

Evaluates if an answer is relevant to a given question.
Content Recommendation:

Suggests content (e.g., articles, products) based on user input.
Duplicate Detection:

Identifies duplicate or near-duplicate entries in a dataset.
Document Clustering:

Groups documents with similar content by calculating pairwise cosine similarity.
Chatbot Response Ranking:

Ranks chatbot responses based on relevance to the user's query.
Resume and Job Description Matching:

Measures the similarity between a resume and a job description.
Brand Sentiment Matching:

Evaluates how well a customer's review aligns with a brand's tone or messaging.

Real-Time NLP Applications Using Cosine Similarity

from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Build the Hugging Face feature-extraction pipeline used to embed text.
embedder = pipeline(
    task="feature-extraction",
    model="sentence-transformers/all-mpnet-base-v2",
)

# Helper function to compute cosine similarity
def compute_cosine_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is run through the module-level ``embedder`` and the output is
    averaged along axis 1 before the two pooled arrays are compared.

    NOTE(review): axis 1 is assumed to be the token axis of the pipeline
    output (one nested ``[tokens][dim]`` list per input) -- confirm against
    the pipeline's return shape.

    Args:
        text1: First text to embed.
        text2: Second text to embed.

    Returns:
        The single similarity score taken from position ``[0][0]`` of the
        pairwise similarity matrix.
    """
    pooled_first, pooled_second = (
        np.mean(embedder(text), axis=1) for text in (text1, text2)
    )
    score_matrix = cosine_similarity(pooled_first, pooled_second)
    return score_matrix[0][0]

# 1. Text Similarity
def text_similarity(text1, text2):
    """Score two texts against each other and report the inputs with the score.

    Returns:
        A dict with keys ``"text1"``, ``"text2"`` and ``"similarity"``.
    """
    return {
        "text1": text1,
        "text2": text2,
        "similarity": compute_cosine_similarity(text1, text2),
    }

# 2. Plagiarism Detection
def plagiarism_detection(student_text, source_texts):
    """Compare a student's text with every source text.

    Args:
        student_text: The text being checked.
        source_texts: Iterable of reference texts to compare against.

    Returns:
        A list of ``{"source", "similarity"}`` dicts, one per source, ordered
        from most to least similar.
    """
    scored = [
        {
            "source": candidate,
            "similarity": compute_cosine_similarity(student_text, candidate),
        }
        for candidate in source_texts
    ]
    scored.sort(key=lambda entry: entry["similarity"], reverse=True)
    return scored

# 3. Semantic Search
def semantic_search(query, documents):
    """Rank documents by their similarity to a query.

    Args:
        query: The search text.
        documents: Iterable of document texts to score.

    Returns:
        A list of ``{"document", "similarity"}`` dicts sorted by descending
        similarity.
    """

    def _score(document):
        # One result record per document, scored against the query.
        return {
            "document": document,
            "similarity": compute_cosine_similarity(query, document),
        }

    hits = list(map(_score, documents))
    return sorted(hits, key=lambda hit: hit["similarity"], reverse=True)

# 4. Question-Answer Relevance
def question_answer_relevance(question, answer):
    """Score how closely an answer matches a question.

    Returns:
        A dict with keys ``"question"``, ``"answer"`` and ``"relevance"``,
        where ``"relevance"`` is the cosine similarity of the two texts.
    """
    relevance_score = compute_cosine_similarity(question, answer)
    report = dict(question=question, answer=answer, relevance=relevance_score)
    return report

# 5. Content Recommendation
def content_recommendation(user_input, content_database):
    """Rank stored content items by similarity to the user's input.

    Args:
        user_input: Text describing what the user is interested in.
        content_database: Iterable of content texts to score.

    Returns:
        A list of ``{"content", "similarity"}`` dicts, best match first.
    """
    ranked = [
        {"content": item, "similarity": compute_cosine_similarity(user_input, item)}
        for item in content_database
    ]
    ranked.sort(key=lambda rec: rec["similarity"], reverse=True)
    return ranked

# 6. Duplicate Detection
def duplicate_detection(text, dataset, *, threshold=0.9):
    """Return the dataset entries that are near-duplicates of ``text``.

    Args:
        text: The text to look for duplicates of.
        dataset: Iterable of candidate texts.
        threshold: Similarity an entry must strictly exceed to count as a
            duplicate. Defaults to 0.9, the cut-off that was previously
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        A list of the entries from ``dataset`` whose cosine similarity to
        ``text`` is greater than ``threshold``, in their original order.
    """
    return [
        entry
        for entry in dataset
        if compute_cosine_similarity(text, entry) > threshold
    ]

# 7. Document Clustering
def document_clustering(documents):
    """Compute the pairwise cosine-similarity matrix of TF-IDF vectors.

    Args:
        documents: Iterable of document texts.

    Returns:
        The matrix produced by ``cosine_similarity`` when the TF-IDF matrix of
        ``documents`` is compared with itself. No cluster labels are assigned
        here; grouping is left to the caller.
    """
    doc_term_matrix = TfidfVectorizer().fit_transform(documents)
    return cosine_similarity(doc_term_matrix, doc_term_matrix)

# 8. Chatbot Response Ranking
def chatbot_response_ranking(user_query, responses):
    """Order candidate chatbot responses by similarity to the user's query.

    Args:
        user_query: The user's message.
        responses: Iterable of candidate response texts.

    Returns:
        A list of ``{"response", "similarity"}`` dicts, highest similarity
        first.
    """
    candidates = [
        {"response": reply, "similarity": compute_cosine_similarity(user_query, reply)}
        for reply in responses
    ]
    return sorted(
        candidates,
        key=lambda candidate: candidate["similarity"],
        reverse=True,
    )

# 9. Resume and Job Description Matching
def resume_job_matching(resume, job_description):
    """Score a resume against a job description.

    Returns:
        A dict with keys ``"resume"``, ``"job_description"`` and
        ``"similarity"``.
    """
    match_score = compute_cosine_similarity(resume, job_description)
    result = {"resume": resume, "job_description": job_description}
    result["similarity"] = match_score
    return result

# 10. Brand Sentiment Matching
def brand_sentiment_matching(customer_review, brand_tone):
    """Score how closely a customer review matches a brand-tone text.

    Returns:
        A dict with keys ``"review"``, ``"brand_tone"`` and ``"alignment"``,
        where ``"alignment"`` is the cosine similarity of the two texts.
    """
    return {
        "review": customer_review,
        "brand_tone": brand_tone,
        "alignment": compute_cosine_similarity(customer_review, brand_tone),
    }

# Example usage
if __name__ == "__main__":
    # Sample inputs shared by the demonstrations below.
    text1 = "Artificial intelligence is transforming industries."
    text2 = "Machine learning is a subset of AI that impacts various sectors."
    documents = [
        "AI is evolving quickly.",
        "Healthcare uses machine learning to improve outcomes.",
        "Finance relies on data science for better predictions.",
    ]
    student_text = "AI is the future of technology."
    source_texts = [
        "Artificial intelligence is the technology of the future.",
        "AI will transform the world.",
    ]
    user_query = "What are the benefits of AI?"
    chatbot_responses = [
        "AI helps automate tasks.",
        "It can analyze data efficiently.",
        "AI provides personalized recommendations.",
    ]

    # Run each of the ten applications once and print its result.
    print("1. Text Similarity:", text_similarity(text1, text2))
    print("2. Plagiarism Detection:", plagiarism_detection(student_text, source_texts))
    print("3. Semantic Search:", semantic_search("AI and finance", documents))
    print("4. Question-Answer Relevance:", question_answer_relevance("What is AI?", "AI is artificial intelligence."))
    print("5. Content Recommendation:", content_recommendation("AI in healthcare", documents))
    print("6. Duplicate Detection:", duplicate_detection("AI is the future of technology.", source_texts))
    print("7. Document Clustering:", document_clustering(documents))
    print("8. Chatbot Response Ranking:", chatbot_response_ranking(user_query, chatbot_responses))
    print("9. Resume and Job Description Matching:", resume_job_matching("I am skilled in AI.", "Looking for AI expertise."))
    # brand_sentiment_matching needs both a review and a brand-tone text; the
    # two strings here are illustrative sample inputs.
    print(
        "10. Brand Sentiment Matching:",
        brand_sentiment_matching(
            "I love how innovative this brand is.",
            "We are an innovative, customer-focused brand.",
        ),
    )
Enter fullscreen mode Exit fullscreen mode
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load the small English spaCy pipeline; every vector below comes from it.
# NOTE(review): the "sm" model packages are described as shipping without
# static word vectors -- confirm that `.vector` from this model is meaningful
# for similarity, or consider a model that includes vectors.
nlp = spacy.load("en_core_web_sm")

# Pre-defined profane words for similarity (seed words)
profane_seeds = ["badword", "offensive", "vulgar"]

# Generate embeddings for the seed words by running each one through `nlp`
# as it is configured at this point in the script.
seed_embeddings = [nlp(word).vector for word in profane_seeds]

def profanity_filter(doc):
    """Pipeline component that stores a censored copy of the document text.

    Every token's vector is compared with the module-level ``seed_embeddings``
    by cosine similarity. A token whose best score exceeds 0.8 is replaced by
    ``"***"``; all other tokens keep their text.

    The censored text is rebuilt with each token's own trailing whitespace
    rather than by joining on single spaces, so punctuation stays attached to
    the preceding word and the original spacing is kept.

    Args:
        doc: The spaCy ``Doc`` being processed. The ``censored_text``
            extension must already be registered on ``Doc``.

    Returns:
        The same ``doc``, with ``doc._.censored_text`` set.
    """
    pieces = []
    for token in doc:
        # One row of scores: this token against every seed word.
        similarities = cosine_similarity([token.vector], seed_embeddings)
        is_profane = max(similarities[0]) > 0.8  # Adjust the threshold as needed
        shown = "***" if is_profane else token.text
        # whitespace_ is the spacing that followed the token in the source text.
        pieces.append(shown + token.whitespace_)

    # Store the censored text in a custom extension
    doc._.censored_text = "".join(pieces)
    return doc

# Register the custom extension that profanity_filter writes to
from spacy.language import Language
from spacy.tokens import Doc
Doc.set_extension("censored_text", default=None)

# Add the profanity filter to the end of the pipeline.
# spaCy v3+ accepts only the name of a registered component in add_pipe and
# rejects a bare callable, so the function is registered under a name first.
# Releases without Language.component still take the callable directly.
if hasattr(Language, "component"):
    Language.component("profanity_filter", func=profanity_filter)
    nlp.add_pipe("profanity_filter", last=True)
else:
    nlp.add_pipe(profanity_filter, last=True)

# Test the pipeline
text = "This is a really vulgar word and offensive language."
doc = nlp(text)
print(f"Censored Text: {doc._.censored_text}")
Enter fullscreen mode Exit fullscreen mode

Top comments (0)