"""
@author: Jorge Vasquez,Colm Rooney,Sahil Chopra
"""
# import modules
import argparse
import os
import string

import pandas as pd
# import spacy
# from spacy import displacy
import nltk
from nltk.corpus import stopwords
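
# The tokenizer and stop-word list below rely on NLTK data packages; this
# guard fetches them on first run if missing (a minimal sketch, assuming
# network access; newer NLTK versions may also require 'punkt_tab').
for resource, path in [('punkt', 'tokenizers/punkt'),
                       ('stopwords', 'corpora/stopwords')]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)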


class Preprocessor:
    def __init__(self, isLowercase=True, isTokenize=True, isStop=True,
                 isNums=True, isPunct=True, isPostagging=False, isNER=False):
        self.isLowercase = isLowercase
        self.isTokenize = isTokenize
        self.isNums = isNums
        self.isStop = isStop
        self.isPunct = isPunct
        self.isPostagging = isPostagging
        self.isNER = isNER
        self.stop_words = set(stopwords.words('english'))

    def transform(self, texts):
        if self.isLowercase:
            print(">>>>> LOWER CASING ...")
            texts = list(map(self.lower_case, texts))
        if self.isNums:
            print(">>>>> REMOVING NUMBERS ...")
            texts = list(map(self.remove_nums, texts))
        if self.isPunct:
            print(">>>>> REMOVING PUNCTUATION ...")
            texts = list(map(self.remove_punct, texts))
        if self.isTokenize:
            print(">>>>> TOKENIZING ...")
            tokens = list(map(self.tokenize, texts))
            # stop-word removal operates on token lists, so it only runs
            # when tokenization is enabled
            if self.isStop:
                print(">>>>> REMOVING STOP WORDS ...")
                tokens = list(map(self.remove_stop, tokens))
            texts = [" ".join(token) for token in tokens]
        print("TRANSFORMATION COMPLETE!")
        return texts

    # sentence segmentation via NLTK's sentence tokenizer
    def segment_sentences(self, text):
        return nltk.sent_tokenize(text)

    def tokenize(self, text):
        return nltk.word_tokenize(text)

    def lower_case(self, text):
        return text.lower()

    def remove_nums(self, text):
        # build a translation table that deletes the digits 0-9
        nums_translator = str.maketrans('', '', '0123456789')
        return text.translate(nums_translator)

    # remove stop words from a token list
    def remove_stop(self, tokens):
        return [token for token in tokens if token not in self.stop_words]

    def remove_punct(self, text):
        # str.maketrans('', '', chars) deletes every character in chars
        punct_translator = str.maketrans('', '', string.punctuation)
        return text.translate(punct_translator)
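
    # Illustrative results of the two translate-based helpers:
    #   remove_punct("Hello, world!") -> "Hello world"
    #   remove_nums("Agent 007")      -> "Agent "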

    # # POS tagging each token (requires the spaCy imports above)
    # def postagging(self, tokens):
    #     spacy_pos_tagged = [(token, token.tag_, token.pos_) for token in tokens]
    #     return spacy_pos_tagged

    # # Apply NER to improve the classification algorithm
    # def name_entity_recognition(self, texts):
    #     sentences = self.segment_sentences(texts)
    #     for sentence in sentences:
    #         print("NEs:", [ne for ne in sentence.ents])
    #         displacy.render(sentence, style='ent', jupyter=True)
    #     return
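
# Illustrative usage of the default pipeline (made-up sentences, not project data):
#   pre = Preprocessor()
#   pre.transform(["The 3 quick brown foxes!", "Jumped over 2 lazy dogs."])
#   -> ['quick brown foxes', 'jumped lazy dogs']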


def main(inputpath, outputpath):
    if os.path.isfile(inputpath):
        df = pd.read_csv(inputpath)
    else:
        raise FileNotFoundError(f'File {inputpath} not found. Retry with another path.')
    preprocessor = Preprocessor()
    # drop rows with missing values so both text columns are complete
    if df['description'].isna().any():
        df = df.dropna()
        df.reset_index(drop=True, inplace=True)
    print('>>>>> Wikipage text processing ...')
    df['preprocessed_page_content'] = preprocessor.transform(df['page_content'])
    print(">>>>> Wikidata description processing ...")
    df['preprocessed_description'] = preprocessor.transform(df['description'])
    preprocessed_df = df[['category', 'page_content', 'preprocessed_page_content',
                          'description', 'preprocessed_description']]
    preprocessed_df.columns = ['Person', 'Wikipage', 'Preprocessed Wikipage',
                               'Description', 'Preprocessed Description']
    # output as csv
    preprocessed_df.to_csv(outputpath, index=False)
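
# Typical invocation (the paths shown are the argparse defaults below):
#   python preprocessing_nltk.py --input data/extracted_data.csv --output data/preprocessed_data.csv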
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Preprocessor")
parser.add_argument("--input", type=str, default='data/extracted_data.csv',
help="Please provide the pathname to the csv file obtained after data extraction")
parser.add_argument("--output", type=str, default='data/preprocessed_data.csv',
help="Please provide an output path for the preprocessed data")
args = parser.parse_args()
main(args.input, args.output)