-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_summarisation.py
121 lines (90 loc) · 3.96 KB
/
text_summarisation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Created on : 03 - 01 - 2023 ( Tuesday )
Source :
1. https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70
2. https://towardsdatascience.com/introduction-to-text-summarization-with-rouge-scores-84140c64b471
3. https://github.com/m-chanakya/shortstories
Created by : Dhanya. B
Topic : Text Summarisation with NLP
Algorithm working:
1. Split the passage into sentences and leave only the alphabets behind.
2. Now, create a zero matrix in size = n x n , where n - number of sentences.
3. Then calculate the similarity between each sentence and for same sentences the value is 0.
4. Similarity is found by,
- lowercasing the sentence
- finding unique words from both sentences
- removing stopwords
- incrementing the count by each occurrence
- then we find the (cosine distance --> ?) and use cos_simi = 1 - cosine distance
5. Then we use the networkx lib to do the following
- creating graph using nx.from_numpy_array()
- finding the scores using nx.pagerank()
6. Rank the sentences in descending order
7. Create the summary
8. Send the summary to find the rouge score of the candidate and reference summaries
"""
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from rouge import Rouge
ROUGE = Rouge()
def read_article(file_name):
article = file_name.split(". ")
sentences = []
for sentence in article:
print(sentence)
sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
sentences.pop()
return sentences
def sentence_similarity(sent1, sent2, stopwords=None):
if stopwords is None:
stopwords = []
sent1 = [w.lower() for w in sent1]
sent2 = [w.lower() for w in sent2]
all_words = list(set(sent1 + sent2))
vector1 = [0] * len(all_words)
vector2 = [0] * len(all_words)
# build the vector for the first sentence
for w in sent1:
if w in stopwords:
continue
vector1[all_words.index(w)] += 1
# build the vector for the second sentence
for w in sent2:
if w in stopwords:
continue
vector2[all_words.index(w)] += 1
return 1 - cosine_distance(vector1, vector2)
def build_similarity_matrix(sentences, stop_words):
# Create an empty similarity matrix
similarity_matrix = np.zeros((len(sentences), len(sentences)))
for idx1 in range(len(sentences)):
for idx2 in range(len(sentences)):
if idx1 == idx2: #ignore if both are same sentences
continue
similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
return similarity_matrix
def generated_rouge(summary_text, ref_text):
score = ROUGE.get_scores(summary_text, ref_text)
return score
def generate_summary(file_name, top_n):
stop_words = stopwords.words('english')
summarize_text = []
# Step 1 - Read text anc split it
sentences = read_article(file_name)
# Step 2 - Generate Similary Martix across sentences
sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
scores = nx.pagerank(sentence_similarity_graph)
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
# print("Indexes of top ranked_sentence order are ", ranked_sentence)
for i in range(top_n):
summarize_text.append(" ".join(ranked_sentence[i][1]))
# Step 5 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))
summary_text = ". ".join(summarize_text)
return summary_text
#generated_rouge(summary_text)