Reuters_1987_Classification/FileReader.py at master · Plozel/Reuters_1987_Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import logging
import math
import re

import numpy as np

logger = logging.getLogger(__name__)


class FileReader:
    """
    Builds a vocabulary, a label index and tf-idf vectors from articles.

    Each article is a mapping with a ``"text"`` entry (a string that is split
    on whitespace) and a ``"labels"`` entry (an iterable of hashable labels).

    Attributes populated at construction time:

    * ``words``     - word -> column index in the document vectors
    * ``df``        - word -> number of construction articles containing it
    * ``labels``    - label -> column index in the label vectors
    * ``inv_labels``- column index -> label
    * ``stop_words``- list of stop words read from ``stop_words_path``
    """

    # Compiled once for the whole class instead of once per processed word.
    _STRIP_SPECIAL_CHARS = re.compile("[^A-Za-z ]+")

    def __init__(
            self, data_articles, is_test=False, stop_words_path="Data/stop_words.txt"
    ):
        """
        :param data_articles: sized collection of article mappings used to
            build the vocabulary, document frequencies and label index
        :param is_test: stored on the instance; not used inside this class
        :param stop_words_path: text file holding one stop word per line
        """
        self.data_articles = data_articles
        self.is_test = is_test
        self.stop_words_path = stop_words_path
        self.number_of_docs = len(self.data_articles)
        self.df = {}
        self.words = {}
        self.stop_words = []
        # Hashed copy of ``stop_words`` used for the per-word membership test;
        # refreshed whenever create_stop_words_list() runs.
        self._stop_word_set = frozenset()
        self.labels = {}
        self.create_stop_words_list()
        self.create_words_bank()
        self.inv_labels = {v: k for k, v in self.labels.items()}

    def create_stop_words_list(self):
        """
        Appends every line of the stop words file (trailing whitespace
        removed) to ``self.stop_words`` and refreshes the lookup set.
        :return:
        """
        logger.info("Creating stop words list...")
        with open(self.stop_words_path) as stop_words_file:
            for stop_word in stop_words_file:
                self.stop_words.append(stop_word.rstrip())
        self._stop_word_set = frozenset(self.stop_words)

    def pre_process(self, word):
        """
        Preprocess a word by turning it to lower case,
        removes some punctuation and returns the word or
        empty string if it is a stop word
        :param word: string
        :return: updated word - string
        """
        word = word.lower().replace("<br />", " ")
        # Everything except ASCII letters and spaces is dropped.
        word = self._STRIP_SPECIAL_CHARS.sub("", word)
        # returns empty string if word is a stop word
        if word in self._stop_word_set:
            return ""
        return word

    def _tokens(self, text):
        """
        Yields the pre-processed tokens of a text, skipping the ones that
        pre_process() reduced to an empty string.
        :param text: string
        :return: generator of strings
        """
        for raw_word in text.split():
            word = self.pre_process(raw_word)
            if word:
                yield word

    def _tfidf_vector(self, text):
        """
        Builds the tf-idf vector of one text over the known vocabulary.
        Words that are not part of ``self.words`` are ignored.
        :param text: string
        :return: list of floats with one entry per vocabulary word
        """
        term_counts = {}
        for word in self._tokens(text):
            if word in self.words:
                term_counts[word] = term_counts.get(word, 0) + 1

        vec = len(self.words) * [0.0]
        # Only the words that occur in the text can be non-zero, so there is
        # no need to walk over the whole vocabulary.
        for word, tf in term_counts.items():
            idf = math.log((self.number_of_docs / self.df[word]), 10)
            vec[self.words[word]] = tf * idf
        return vec

    def _label_vector(self, labels, unknown=None):
        """
        Builds the 0/1 indicator vector of the given labels.
        Labels missing from ``self.labels`` have no column and are skipped.
        :param labels: iterable of labels
        :param unknown: optional set that collects the skipped labels
        :return: list of ints with one entry per known label
        """
        vec_labels = len(self.labels) * [0]
        for label in labels:
            index = self.labels.get(label)
            if index is None:
                if unknown is not None:
                    unknown.add(label)
                continue
            vec_labels[index] = 1
        return vec_labels

    def create_words_bank(self):
        """
        Creates the words bank from all the articles together with the
        document frequency of every word and the pool of labels
        :return:
        """
        logger.info("Creating words bank...")

        for article in self.data_articles:
            # A set keeps the "already counted in this article" test O(1).
            seen_in_this_article = set()
            for word in self._tokens(article["text"]):
                if word not in seen_in_this_article:
                    seen_in_this_article.add(word)
                    self.df[word] = self.df.get(word, 0) + 1  # document frequency
                # New words get the next free column index.
                self.words.setdefault(word, len(self.words))
            # create the pool of labels
            for label in article["labels"]:
                self.labels.setdefault(label, len(self.labels))

    def build_set_tfidf(self):
        """
        Builds the data vectors of ``self.data_articles`` using tfidf format
        :return: tuple (documents in tfidf vector form, label indicator
            vectors), both as numpy arrays with one row per article
        """
        logger.info("Building tfidf set...")

        doc_set = []
        labels_set = []
        for article in self.data_articles:
            doc_set.append(self._tfidf_vector(article["text"]))
            labels_set.append(self._label_vector(article["labels"]))
        return np.array(doc_set), np.array(labels_set)

    def parse_test(self, test_articles, debug=False):
        """
        Builds the data vector using tfidf format
        :param test_articles: the articles to be processed
        :param debug: if True returns the labels of the articles as well, for referencing prediction
        :return: the articles in vector form, using tfidf format; with debug
            a tuple (vectors, label indicator vectors). Labels that are not
            in ``self.labels`` are left out of the indicator vectors.
        """
        logger.info("Parsing (tfidf) test data...")
        doc_set = []
        labels_set = []
        unknown_labels = set()

        for article in test_articles:
            doc_set.append(self._tfidf_vector(article["text"]))
            if debug:
                labels_set.append(
                    self._label_vector(article["labels"], unknown_labels)
                )

        if debug:
            if unknown_labels:
                # Reported once instead of raising KeyError on the first one.
                logger.warning(
                    "Ignored %d label(s) missing from the label index: %s",
                    len(unknown_labels),
                    sorted(unknown_labels, key=str),
                )
            return np.array(doc_set), np.array(labels_set)

        return np.array(doc_set)