Text-chunking-based-on-GP/opt_gp.py at master · Alwaysproblem/Text-chunking-based-on-GP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import glob
import os
import zipfile

import gpflow
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix as csrmat, vstack

train_zip_path = 'conll_train.zip'
dev_zip_path = 'conll_dev.zip'

def Load_Data_zip(FileName, Dim, sample_rate = 0.1):
    X_data = list()
    y_data = list()
    with zipfile.ZipFile(FileName) as train_zip:
        for fname in train_zip.namelist():
            if '.' in fname:
                with train_zip.open(fname) as f:
                    if fname[-1] == "x":
                        X_data.append(load_x(f, Dim))
                    elif fname[-1] == "y":
                        y_data.append(load_y(f))

    sample_num = round(sample_rate * len(X_data))
    sample_index = np.random.choice(range(len(X_data)), size=sample_num, replace=False)

    return vstack(X_data, dtype=np.float), np.concatenate(y_data, axis=0)

def load_x(file, Dim):
    X_form = np.loadtxt(file, dtype=np.int32)
    row_num = np.max(X_form[:, 0])
    Sparse = csrmat((X_form[:, 2], (X_form[:, 0] - 1, X_form[:, 1] - 1)), shape=(row_num, Dim), dtype=np.float)
    return Sparse

def load_y(file):
    y_form = np.loadtxt(file)
    return np.mat(y_form).T

def sample_from_data(X_data, y_data, sample_rate = 0.1):
    sample_num = round(sample_rate * len(X_data))
    sample_index = np.random.choice(range(len(X_data)), size=sample_num, replace=False)
    return X_data[sample_index, :], y_data[sample_index, :]

def softmax_classfier(input_data, label, Comp_dims, class_num, X_test, y_test):

    input_in = tf.placeholder(tf.float32, shape=(None, Comp_dims), name="input_data")
    lab = tf.placeholder(tf.int32, shape=(None,), name="label")

    pred = tf.layers.dense(input_in, class_num, activation=tf.nn.softmax)
    y_label = tf.one_hot(lab, class_num)

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                        logits = pred,
                        labels = y_label
                    )
    loss = tf.reduce_mean(cross_entropy, name = "loss")
    opt = tf.train.AdamOptimizer(0.1).minimize(loss)
    acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(pred, 1), tf.argmax(y_label, 1)), dtype = tf.float32), name = "accuracy")

    init = tf.global_variables_initializer()

    with tf.Session() as s:
        s.run(init)
        s.run(opt, feed_dict={input_in: input_data, lab: label})
        train_acc = s.run(acc, feed_dict={input_in: input_data, lab: label})
        test_acc = s.run(acc, feed_dict={input_in: X_test, lab: y_test})

    return train_acc, test_acc


def SVGP(X, y, X_test, y_test, C_num, start = 1, num_inducing = 1500):
    """
    the X should like: (batch_size, dims)
    the y should like: (batch_size, 1) and start with 0 not 1
    """
    dims = X.shape[1]
    y = y - start

    num_inducing = num_inducing

    from scipy.cluster.vq import kmeans
    Z = kmeans(X, num_inducing)[0]

    SVGP = gpflow.models.SVGP(
        X, y,
        kern=gpflow.kernels.RBF(dims) + gpflow.kernels.White(dims, variance = 0.01),
        Z=Z,
        likelihood=gpflow.likelihoods.MultiClass(C_num),
        num_latent=C_num,
        whiten=True,
        q_diag=True
    )

    gpflow.train.ScipyOptimizer().minimize(SVGP)

    p_train, _ = SVGP.predict_y(X)

    p_test, _ = SVGP.predict_y(X_test)

    train_pred = np.argmax(p_train, axis=1) + start
    test_pred = np.argmax(p_test, axis=1) + start

    train_acc = accuracy_score(y, train_pred)
    test_acc = accuracy_score(y_test, test_pred)

    return train_acc, test_acc


def main():

    D = 2035523
    # define the dimentional of give example sparse matrix.

    Comp_dims = 200
    #define the dimentional of compressed matrix.

    C = 23

    print("Loading data from the zip files....")
    X_train, y_train = Load_Data_zip(train_zip_path, D)
    X_dev, y_dev = Load_Data_zip(dev_zip_path, D)

    print("compress the training data and dev data with TruncatedSVD.")
    svd = TruncatedSVD(Comp_dims, n_iter=100)
    svd.fit(X_train)
    x_train_svd = np.mat(svd.transform(X_train))
    x_dev_svd = np.mat(svd.transform(X_dev))

    print("Sampling from the Cmompressed Data")
    x_train, y_train = sample_from_data(x_train_svd, y_train, sample_rate=0.05)
    print(f"the size of training set is {len(x_train)}")
    x_dev, y_dev = sample_from_data(x_dev_svd, y_dev, sample_rate=0.1)
    print(f"the size of dev set is {len(x_dev)}")

    print("start training...\n")
    train_acc, dev_acc = SVGP(x_train, y_train, x_dev, y_dev, C, 0, num_inducing = 2000)
    print("end.")

    print(f"the train accuracy is {train_acc * 100}%.")
    print(f"the cross validation accuracy is {dev_acc * 100}%.")

    # train_acc, dev_acc = softmax_classfier(X_train, y_train, Comp_dims, C, X_dev, y_dev)


if __name__ == '__main__':
    main()