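'''
Compare several word-embedding methods on a shared vocabulary: qualitative
checks (dimension coherency, nearest neighbors) plus quantitative tasks
(sentiment analysis, PoS classification, outlier detection, analogies, and
the word-embeddings-benchmarks suite), averaged over runs and written to an
Excel sheet.
'''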
import heapq
import os
import random
import sys
import time

import numpy as np
import pandas as pd

from embedding_benchmarks.scripts.web.embeddings import load_embedding
from embedding_benchmarks.scripts.web.evaluate import evaluate_on_all
from embedding_evaluation import EmbeddingTaskEvaluator


class EmbeddingComparison(object):
    def __init__(self, num_sents, min_count, methods, comparison_name,
                 embedding_dim=None, embedding_dim_list=None, normalize=True):
        '''
        `methods` is a list of strings naming the embedding methods to compare.
        `num_sents`, `embedding_dim`/`embedding_dim_list`, and `min_count` are
        passed in uniformly for a fairer comparison: every method is evaluated
        on the exact same test suite, whose vocabulary is determined by those
        parameters.
        '''
        self.evaluators = []
        num_sents = int(num_sents)
        min_count = int(min_count)
        if embedding_dim is not None:
            embedding_dim = int(embedding_dim)
            embedding_dim_list = [embedding_dim for _ in methods]
        self.methods = methods
        self.num_sents = num_sents
        self.min_count = min_count
        self.embedding_dim_list = embedding_dim_list
        self.comparison_name = comparison_name
        # If we are comparing embeddings of multiple dimensions, append the
        # dimension to the method name so results stay distinguishable.
        diff_dims = len(set(embedding_dim_list)) != 1
        for method, dim in zip(methods, embedding_dim_list):
            if method == 'word2vec':
                fname = '../word2vec.txt'
            else:
                fname = 'runs/{}/{}_{}_{}/vectors.txt'.format(method, num_sents, min_count, dim)
            if diff_dims:
                method += '_{}'.format(dim)
            self.evaluators.append(EmbeddingTaskEvaluator(method=method, fname=fname, normalize_vects=normalize))
        # Restrict every evaluator to the intersection of all vocabularies so
        # each method is scored on exactly the same words.
        print('intersecting vocabs...')
        vocab_sets = [set(evaluator.embedding_dict.keys()) for evaluator in self.evaluators]
        self.vocab_set = set.intersection(*vocab_sets)
        print('intersected vocab len: {}'.format(len(self.vocab_set)))
        print('intersecting embedding dicts...')
        for evaluator in self.evaluators:
            d = evaluator.embedding_dict
            for k in list(d.keys()):
                if k not in self.vocab_set:
                    d.pop(k, None)
        print('done initializing!')
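
    # Minimal usage sketch (hypothetical settings; the vector files under
    # runs/ must already exist for the chosen methods and parameters):
    #
    #   comparator = EmbeddingComparison(num_sents=1e5, min_count=1000,
    #                                    methods=['cbow', 'sgns'],
    #                                    comparison_name='demo',
    #                                    embedding_dim=300)
    #   comparator.compare_all(num_runs=3)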

    def print_method(self, method):
        print()
        print("===={}====".format(method))

    def compare_word_dimensions(self, words):
        '''
        Qualitative evaluation. Gets the top `n` dimensions of each word in
        `words`, then prints the top `k` words in each of those dimensions.
        `words` is a list of strings.
        '''
        print("\n==================================")
        for evaluator in self.evaluators:
            self.print_method(evaluator.method)
            for word in words:
                vec = evaluator.embedding_dict[word]
                k = 3  # words to show per dimension
                n = 2  # top dimensions to inspect per word
                top_n_dims = vec.argsort()[-n:][::-1]
                print('Word: {}'.format(word))
                for i in range(n):
                    # Exclude the query word itself from the top-k list.
                    top_words = [w for (w, v) in heapq.nlargest(
                        k + 1, evaluator.embedding_dict.items(),
                        key=lambda x: x[1][top_n_dims[i]]) if w != word][:k]
                    print('top words in dimension {}: {}'.format(top_n_dims[i], ','.join(top_words)))

    def compare_nearest_neighbors(self, words):
        '''
        Qualitative evaluation. Prints the nearest vectors to each word in
        `words`. `words` is a list of strings.
        '''
        print("\n==================================")

        def closest_neighbors(evaluator, word, n=5):
            embedding_dict = evaluator.embedding_dict
            cos_sims = []
            if word not in embedding_dict:
                print('{} not in vocab'.format(word))
                return []
            vec = embedding_dict[word]
            for word2, vec2 in embedding_dict.items():
                if word == word2:
                    continue
                # Rank neighbors by cosine similarity: cos(u, v) = u.v / (|u||v|).
                cos_sim = np.dot(vec, vec2) / (np.linalg.norm(vec) * np.linalg.norm(vec2))
                cos_sims.append((cos_sim, word2))
            nlargest = heapq.nlargest(n, cos_sims)
            return [word2 for (sim, word2) in nlargest]

        for evaluator in self.evaluators:
            self.print_method(evaluator.method)
            for word in words:
                nlargest = closest_neighbors(evaluator, word)
                if not nlargest:
                    continue
                print('Closest words to {}: {}'.format(word, ', '.join(nlargest)))
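
    # compare_web delegates to the word-embeddings-benchmarks ("web") suite
    # imported above; each method's vectors are re-read from disk in word2vec
    # text format rather than reusing the in-memory embedding dicts.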
    def compare_web(self, normalize=True):
        print("\n==================================")
        frames = []
        for evaluator in self.evaluators:
            self.print_method(evaluator.method)
            vecpath = evaluator.fname
            w = load_embedding(vecpath, format='word2vec', normalize=normalize, lower=True, clean_words=False, load_kwargs={})
            results = evaluate_on_all(w, categorization=False)
            results.index = [evaluator.method]
            frames.append(results)
            print(results)
        all_results = pd.concat(frames)
        return all_results

    def compare_word_classification(self, train_pct=1.0):
        print("\n==================================")
        score_dict = {}
        for evaluator in self.evaluators:
            self.print_method(evaluator.method)
            score = evaluator.word_classification_tasks(classification_problem='PoS', train_pct=train_pct)
            print("PoS classification ({}%) score: {}".format(int(train_pct * 100), score))
            score_dict[evaluator.method] = score
        return score_dict

    def compare_sentiment_analysis(self, train_pct=1.0):
        print("\n==================================")
        score_dict = {}
        for evaluator in self.evaluators:
            self.print_method(evaluator.method)
            score = evaluator.sentiment_analysis_tasks(train_pct=train_pct)
            print("Sentiment classification ({}%) score: {}".format(train_pct * 100.0, score))
            score_dict[evaluator.method] = score
        return score_dict

    def compare_analogy(self, train_pct, iter_pct=1.0, is_sem_only=False, reg_param=0.001, regularize_all=False,
                        multiplicative=False):
        print("\n==================================")
        sem_dict = {}
        syn_dict = {}
        for evaluator in self.evaluators:
            self.print_method(evaluator.method)
            (sem_score, syn_score) = evaluator.analogy_tasks(train_pct=train_pct,
                                                             iter_pct=iter_pct,
                                                             is_sem_only=is_sem_only,
                                                             reg_param=reg_param,
                                                             regularize_all=regularize_all,
                                                             multiplicative=multiplicative,
                                                             )
            print("Analogy sem/syn scores: {}".format((sem_score, syn_score)))
            sem_dict[evaluator.method] = sem_score
            syn_dict[evaluator.method] = syn_score
        return sem_dict, syn_dict
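
    # "OPP" below appears to be the Outlier Position Percentage metric of
    # Camacho-Collados & Navigli (2016); the evaluator returns both OPP and
    # accuracy as percentages, which are rescaled to [0, 1] fractions here.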
    def compare_outlier_detection(self, n=3):
        print("\n==================================")
        opp_dict = {}
        acc_dict = {}
        print("COMPARING OUTLIER DETECTION WITH N={}".format(n))
        for evaluator in self.evaluators:
            method = evaluator.method
            opp, accuracy = evaluator.outlier_detection(verbose=False, n=n)
            self.print_method(evaluator.method)
            print("OD{} OPP: {}".format(n, opp))
            print("OD{} accuracy: {}".format(n, accuracy))
            opp_dict[method] = opp / 100.0
            acc_dict[method] = accuracy / 100.0
        return opp_dict, acc_dict
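
    # compare_coherency is a word-intrusion style spot check: if a dimension
    # is interpretable, its top words should look coherent and the injected
    # outlier should stand out.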
    def compare_coherency(self, n):
        def get_outlier(evaluator, dim):
            '''
            Returns a word in the bottom half of dimension `dim` that is also
            in the top 10% of some other dimension.
            '''
            embedding_dict = evaluator.embedding_dict
            num_words = len(embedding_dict)
            dim_values = [(vec[dim], word) for word, vec in embedding_dict.items()]
            bottom_half = heapq.nsmallest(num_words // 2, dim_values)
            while True:
                rand_word = random.choice(bottom_half)[1]
                for other_dim in range(evaluator.embedding_dim):
                    if other_dim == dim:
                        continue
                    other_values = [(vec[other_dim], word) for word, vec in embedding_dict.items()]
                    top_10pct = heapq.nlargest(num_words // 10, other_values)
                    if rand_word in [word for (val, word) in top_10pct]:
                        return rand_word

        def top_n_words_for_dim(evaluator, dim, n=5):
            embedding_dict = evaluator.embedding_dict
            dim_values = [(vec[dim], word) for word, vec in embedding_dict.items()]
            topn = heapq.nlargest(n, dim_values)
            return [word for (val, word) in topn]

        for evaluator in self.evaluators:
            rand_dim = np.random.randint(0, evaluator.embedding_dim)
            self.print_method(evaluator.method)
            print("Highest words in dim {}: {}, with outlier '{}'".format(
                rand_dim,
                top_n_words_for_dim(evaluator, rand_dim, n=n),
                get_outlier(evaluator, rand_dim),
            ))
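
    # compare_all runs every task num_runs times (bumping each evaluator's
    # seed per run) and averages the per-task scores across runs.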
    def compare_all(self, num_runs=1, include_web=True):
        all_dfs = []  # one results DataFrame per run, to allow random restarts
        for run_index in range(num_runs):
            for evaluator in self.evaluators:
                evaluator.seed_bump = run_index
            print('qualitative:')
            self.compare_coherency(n=3)
            words = random.sample(list(self.vocab_set), 5)
            #self.compare_word_dimensions(words)
            self.compare_nearest_neighbors(words)
            print('quantitative:')
            # These run roughly fastest to slowest.
            sentiment_analysis_10_results = self.compare_sentiment_analysis(train_pct=.1)
            sentiment_analysis_30_results = self.compare_sentiment_analysis(train_pct=.3)
            sentiment_analysis_50_results = self.compare_sentiment_analysis(train_pct=.5)
            sentiment_analysis_results = self.compare_sentiment_analysis()
            word_class_10_results = self.compare_word_classification(train_pct=.1)
            word_class_30_results = self.compare_word_classification(train_pct=.3)
            word_class_50_results = self.compare_word_classification(train_pct=.5)
            word_class_results = self.compare_word_classification(train_pct=1.0)
            outlier_det2_opps, outlier_det2_accs = self.compare_outlier_detection(n=2)
            outlier_det3_opps, outlier_det3_accs = self.compare_outlier_detection(n=3)
            #analogy_sem_results_10, analogy_syn_results_10 = self.compare_analogy(.1)
            #analogy_sem_results_30, analogy_syn_results_30 = self.compare_analogy(.3)
            #analogy_sem_results_50, analogy_syn_results_50 = self.compare_analogy(.5)
            #analogy_sem_results, analogy_syn_results = self.compare_analogy(1.0)
            result_name_pairs = [
                #(analogy_sem_results_10, 'Analogy 10% (sem)'),
                #(analogy_sem_results_30, 'Analogy 30% (sem)'),
                #(analogy_sem_results_50, 'Analogy 50% (sem)'),
                #(analogy_sem_results, 'Analogy 100% (sem)'),
                #(analogy_syn_results_10, 'Analogy 10% (syn)'),
                #(analogy_syn_results_30, 'Analogy 30% (syn)'),
                #(analogy_syn_results_50, 'Analogy 50% (syn)'),
                #(analogy_syn_results, 'Analogy 100% (syn)'),
                (sentiment_analysis_10_results, 'Sentiment analysis (10%)'),
                (sentiment_analysis_30_results, 'Sentiment analysis (30%)'),
                (sentiment_analysis_50_results, 'Sentiment analysis (50%)'),
                (sentiment_analysis_results, 'Sentiment analysis (100%)'),
                (word_class_10_results, 'PoS classification (10%)'),
                (word_class_30_results, 'PoS classification (30%)'),
                (word_class_50_results, 'PoS classification (50%)'),
                (word_class_results, 'PoS classification (100%)'),
                (outlier_det2_opps, 'OD2 OPP'),
                (outlier_det2_accs, 'OD2 acc'),
                (outlier_det3_opps, 'OD3 OPP'),
                (outlier_det3_accs, 'OD3 acc'),
            ]
            df = pd.DataFrame([d for (d, name) in result_name_pairs])
            df.index = [name for (d, name) in result_name_pairs]
            df = df.transpose()
            if include_web:
                web_results = self.compare_web()
                all_df = web_results.join(df)
                all_df = all_df.transpose()  # excel likes it better this way
            else:
                all_df = df.transpose()
            # Reorder columns to the initial method order, using the (possibly
            # dimension-suffixed) evaluator names.
            all_df = all_df[[evaluator.method for evaluator in self.evaluators]]
            all_dfs.append(all_df)
        df_sum = all_dfs[0]
        for df in all_dfs[1:]:
            df_sum += df
        avg_df = (1. / num_runs) * df_sum
        # write the averaged results to an excel file
        excel_fname = 'comparison_{}_{}_{}.xlsx'.format(self.num_sents, self.min_count, self.comparison_name)
        with pd.ExcelWriter(excel_fname) as writer:
            avg_df.to_excel(writer)
        print(avg_df)
        print('Saved to {}'.format(excel_fname))
        print('Num runs: {}'.format(num_runs))
        print(np.std(all_dfs, axis=0))  # per-cell standard deviation across runs
        return avg_df


if __name__ == '__main__':
    if len(sys.argv) == 1:
        raise ValueError('Please specify the name of the comparison')
    comparison_name = sys.argv[1]
    embedding_dim = None
    embedding_dim_list = None
    min_count = 1000
    if comparison_name == '1e5':
        num_sents = int(float(comparison_name))
        methods = ['random', 'cbow', 'sgns', 'glove_sym', 'nnse', 'cp-s', 'jcp-s', 'cp-s_best']
        embedding_dim = 300
    elif comparison_name == 'web':
        num_sents = int(1e5)
        methods = ['random', 'cbow', 'sgns', 'glove', 'nnse', 'cp-s', 'jcp-s']
        embedding_dim = 300
    elif comparison_name == 'glove':
        num_sents = int(1e5)
        methods = ['random', 'glove', 'glove_sym']
        embedding_dim = 300
    elif comparison_name == 'hosg':
        num_sents = int(1e5)
        methods = ['random', 'cp-s', 'jcp-s', 'hosg']
        embedding_dim = 300
    elif comparison_name == 'learn_nn_hparams':
        num_sents = int(1e5)
        methods = ['random', 'glove', 'nnse', 'cp-s', 'jcp-s']
        embedding_dim = 300
    else:
        raise ValueError('Unknown comparison name: {}'.format(comparison_name))
    comparator = EmbeddingComparison(
        methods=methods,
        num_sents=num_sents,
        min_count=min_count,
        embedding_dim=embedding_dim,
        embedding_dim_list=embedding_dim_list,
        comparison_name=comparison_name,
    )
    if comparison_name == 'web':
        comparator.compare_web(normalize=True)
    elif comparison_name != 'learn_nn_hparams':
        comparator.compare_all(num_runs=10)
    else:
        t = time.time()
        results_dict = dict()
        n_random_trials = 1

        def loguniform(low=1e-6, high=1, size=None):
            return np.exp(np.random.uniform(np.log(low), np.log(high), size))
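
        # Random search over reg_param: log-uniform sampling spreads trials
        # evenly across orders of magnitude, the usual choice when searching
        # regularization strengths.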
        for _ in range(10):
            reg_param = loguniform(.001, 0.01)  # regularization strength for W3
            #reg_param = 0.0030505989722323123  # pretty high reg param
            iter_pct = int(np.random.uniform(2, 10))
            for regularize_all in [False]:
                multiplicative = True
                mean_scores = {method: np.array([0., 0., 0., 0.], dtype=np.float64) for method in methods}
                for _ in range(n_random_trials):
                    for evaluator in comparator.evaluators:
                        evaluator.seed_bump += 1
                    kwargs = dict(
                        is_sem_only=False,
                        iter_pct=iter_pct,
                        reg_param=reg_param,
                        regularize_all=regularize_all,
                        multiplicative=multiplicative,
                    )
                    # [0] keeps only the semantic analogy scores.
                    scores1 = comparator.compare_analogy(0.1, **kwargs)[0]
                    scores2 = comparator.compare_analogy(0.3, **kwargs)[0]
                    scores3 = comparator.compare_analogy(0.5, **kwargs)[0]
                    scores5 = comparator.compare_analogy(1.0, **kwargs)[0]
                    for method in methods:
                        mean_scores[method] += np.array([
                            scores1[method], scores2[method], scores3[method], scores5[method]
                        ]) / n_random_trials
                for method in methods:
                    mean_scores[method] = ["{:.4f}".format(x) for x in mean_scores[method]]
                results_dict[(iter_pct, regularize_all, reg_param)] = mean_scores
                os.makedirs('hparams', exist_ok=True)  # make sure the output dir exists
                with open('hparams/{}_{}_{:.4f}_multiplicative.txt'.format(iter_pct, regularize_all, reg_param), 'a') as f:
                    for k in mean_scores:
                        print("{}: {}\n".format(k, mean_scores[k]), file=f)
                print('time so far: {}'.format(time.time() - t))
        for k in results_dict:
            print("{}: {}".format(k, results_dict[k]))
        print('welp. that took {:.4f} seconds'.format(time.time() - t))
        print("with biases and with no tanh, {} random restarts".format(n_random_trials))
        print("high iter_pct's, fixed biases")
        print("MULTIPLICATIVE")
        sys.exit()