srikanththirumani committed on
Commit
8c61de9
1 Parent(s): 5ae9fd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -17
app.py CHANGED
@@ -10,8 +10,6 @@ import spacy
10
  import matplotlib.pyplot as plt
11
  import io
12
  import base64
13
- from sklearn.feature_extraction.text import TfidfVectorizer
14
- import numpy as np
15
 
16
  # Load spaCy model for semantic analysis
17
  nlp = spacy.load("en_core_web_md")
@@ -37,11 +35,7 @@ def preprocess_text(text):
37
  tokens = word_tokenize(text)
38
  return [word for word in tokens if word not in stop_words]
39
 
40
- def calculate_word_similarity(text1, text2):
41
- words1 = preprocess_text(text1)
42
- words2 = preprocess_text(text2)
43
- vec1 = Counter(words1)
44
- vec2 = Counter(words2)
45
  intersection = set(vec1.keys()) & set(vec2.keys())
46
  numerator = sum([vec1[x] * vec2[x] for x in intersection])
47
  sum1 = sum([vec1[x]**2 for x in vec1.keys()])
@@ -50,17 +44,29 @@ def calculate_word_similarity(text1, text2):
50
  if not denominator:
51
  return 0.0
52
  else:
53
- return float(numerator) / denominator * 100
 
 
 
 
 
 
 
 
54
 
55
  def calculate_sentence_similarity(text1, text2):
56
  sentences1 = sent_tokenize(text1)
57
  sentences2 = sent_tokenize(text2)
58
- tfidf_vectorizer = TfidfVectorizer()
59
- tfidf_matrix1 = tfidf_vectorizer.fit_transform(sentences1)
60
- tfidf_matrix2 = tfidf_vectorizer.transform(sentences2)
61
- cosine_similarities = (tfidf_matrix1 * tfidf_matrix2.T).A
62
- avg_similarity = np.mean(np.max(cosine_similarities, axis=1)) * 100
63
- return avg_similarity
 
 
 
 
64
 
65
  def semantic_similarity(text1, text2):
66
  doc1 = nlp(text1)
@@ -74,15 +80,15 @@ def longest_common_subsequence(text1, text2):
74
  L = [[0] * (n + 1) for _ in range(m + 1)]
75
  for i in range(1, m + 1):
76
  for j in range(1, n + 1):
77
- if sentences1[i-1] == sentences2[j-1]:
78
  L[i][j] = L[i-1][j-1] + 1
79
  else:
80
  L[i][j] = max(L[i-1][j], L[i][j-1])
81
  lcs = []
82
  i, j = m, n
83
  while i > 0 and j > 0:
84
- if sentences1[i-1] == sentences2[j-1]:
85
- lcs.append(sentences1[i-1])
86
  i -= 1
87
  j -= 1
88
  elif L[i-1][j] > L[i][j-1]:
 
10
  import matplotlib.pyplot as plt
11
  import io
12
  import base64
 
 
13
 
14
  # Load spaCy model for semantic analysis
15
  nlp = spacy.load("en_core_web_md")
 
35
  tokens = word_tokenize(text)
36
  return [word for word in tokens if word not in stop_words]
37
 
38
+ def cosine_similarity(vec1, vec2):
 
 
 
 
39
  intersection = set(vec1.keys()) & set(vec2.keys())
40
  numerator = sum([vec1[x] * vec2[x] for x in intersection])
41
  sum1 = sum([vec1[x]**2 for x in vec1.keys()])
 
44
  if not denominator:
45
  return 0.0
46
  else:
47
+ return float(numerator) / denominator
48
+
49
+ def calculate_word_similarity(text1, text2):
50
+ words1 = preprocess_text(text1)
51
+ words2 = preprocess_text(text2)
52
+ vec1 = Counter(words1)
53
+ vec2 = Counter(words2)
54
+ similarity = cosine_similarity(vec1, vec2)
55
+ return similarity * 100
56
 
57
  def calculate_sentence_similarity(text1, text2):
58
  sentences1 = sent_tokenize(text1)
59
  sentences2 = sent_tokenize(text2)
60
+ similarities = []
61
+ for sent1 in sentences1:
62
+ max_similarity = 0
63
+ for sent2 in sentences2:
64
+ similarity = calculate_word_similarity(sent1, sent2)
65
+ if similarity > max_similarity:
66
+ max_similarity = similarity
67
+ similarities.append(max_similarity)
68
+ average_similarity = sum(similarities) / len(similarities) if similarities else 0.0
69
+ return average_similarity
70
 
71
  def semantic_similarity(text1, text2):
72
  doc1 = nlp(text1)
 
80
  L = [[0] * (n + 1) for _ in range(m + 1)]
81
  for i in range(1, m + 1):
82
  for j in range(1, n + 1):
83
+ if sentences2[j-1] in sentences1:
84
  L[i][j] = L[i-1][j-1] + 1
85
  else:
86
  L[i][j] = max(L[i-1][j], L[i][j-1])
87
  lcs = []
88
  i, j = m, n
89
  while i > 0 and j > 0:
90
+ if sentences2[j-1] in sentences1:
91
+ lcs.append(sentences2[j-1])
92
  i -= 1
93
  j -= 1
94
  elif L[i-1][j] > L[i][j-1]: