-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTFIDF_model.py
More file actions
104 lines (87 loc) · 3.83 KB
/
TFIDF_model.py
File metadata and controls
104 lines (87 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 11 01:45:37 2021
@author: Aditi.Dhamat
"""
import regex as re
import nltk
import heapq
import numpy as np
# Raw input text for the TF-IDF walkthrough. It is split into sentences
# further down, and each sentence is then treated as one "document".
paragraph = """Thank you all so very much.
Thank you to the Academy. Thank you to all of you in
this room. I have to congratulate the other
incredible nominees this year. The Revenant was the
product of the tireless efforts of an unbelievable
cast and crew. First off, to my brother in this
endeavor, Mr. Tom Hardy. Tom, your talent on screen
can only be surpassed by your friendship off screen.
Thank you for creating a transcendent cinematic experience.
Thank you to everybody at Fox and New Regency …
my entire team. I have to thank everyone from the
very onset of my career … To my parents; none of this
would be possible without you. And to my friends,
I love you dearly; you know who you are.And lastly,
I just want to say this: Making The Revenant was
about man's relationship to the natural world.
A world that we collectively felt in 2015 as the
hottest year in recorded history. Our production
needed to move to the southern tip of this planet
just to be able to find snow. Climate change is real,
it is happening right now. It is the most urgent
threat facing our entire species, and we need to work
collectively together and stop procrastinating. We
need to support leaders around the world who do not
speak for the big polluters, but who speak for all of
humanity, for the indigenous people of the world, for
the billions and billions of underprivileged people
out there who would be most affected by this. For our
children’s children, and for those people out there
whose voices have been drowned out by the politics of
greed. I thank you all for this amazing award tonight.
Let us not take this planet for granted. I do not
take tonight for granted. Thank you so very much."""
# Split the paragraph into sentences; every sentence acts as one document.
sentences = nltk.sent_tokenize(paragraph)

# Normalise each sentence: lowercase it, turn every non-word character into a
# space, then squeeze each run of whitespace down to a single space.
sentences = [
    re.sub(r"\s+", " ", re.sub(r"\W", " ", sentence.lower()))
    for sentence in sentences
]
# Create a histogram: word -> number of occurrences across all sentences.
wordcount = {}
for sentence in sentences:
    for token in nltk.word_tokenize(sentence):
        wordcount[token] = wordcount.get(token, 0) + 1

# Keep the 100 most frequent words (fewer when the vocabulary is smaller).
freq_words = heapq.nlargest(100, wordcount, key=wordcount.get)
# IDF for each word.
# Tokenise every sentence exactly once; the per-sentence sets make the
# "does this sentence contain the word" test O(1) instead of re-tokenising
# each sentence for each frequent word.
sentence_vocab = [set(nltk.word_tokenize(data)) for data in sentences]
total_docs = len(sentences)

word_idf = {}
for word in freq_words:
    # Number of sentences (documents) in which the word appears at least once.
    doc_count = sum(1 for vocab in sentence_vocab if word in vocab)
    # Adding 1 inside the log keeps the weight strictly positive: a word that
    # occurs in every sentence has ratio 1 and still scores log(2).
    word_idf[word] = np.log((total_docs / doc_count) + 1)
# TF for each word in each document.
# Tokenise every sentence once up front instead of twice per
# (word, sentence) pair.
tokenized_sentences = [nltk.word_tokenize(data) for data in sentences]

# tf_matrix maps each frequent word to a list with one entry per sentence:
# occurrences of the word in that sentence divided by the sentence's token
# count.
tf_matrix = {}
for word in freq_words:
    tf_matrix[word] = [
        # A sentence that cleaned down to zero tokens contributes 0.0 rather
        # than raising ZeroDivisionError.
        tokens.count(word) / len(tokens) if tokens else 0.0
        for tokens in tokenized_sentences
    ]
# TF-IDF: scale every term-frequency entry by its word's IDF weight.
# One row per word (in tf_matrix order), one column per sentence.
tfidf_matrix = [
    [tf * word_idf[term] for tf in tf_values]
    for term, tf_values in tf_matrix.items()
]

# Transpose so that rows correspond to sentences and columns to words.
X = np.asarray(tfidf_matrix).T