-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path0727_data_general.py
More file actions
81 lines (47 loc) · 2.07 KB
/
0727_data_general.py
File metadata and controls
81 lines (47 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
import random
# Build a class-balanced training set by capping label 0 and replicating the
# other labels, then write it out twice: once in deterministic (label-grouped)
# order and once shuffled.

# --- Load the raw files (paths are relative to the working directory) ---
train = pd.read_csv('../train.csv')
test = pd.read_csv('../test.csv')
sample_submission = pd.read_csv('../sample_submission.csv')  # loaded but not used below

# Label distribution of the raw training set; not used below, kept for inspection.
james = train['label']
james_value_counts = james.value_counts()

# --- Keep only the text columns (plus the label for train) ---
# Korean column names, roughly: project title, summary/research goal,
# summary/research content.
train = train[['과제명', '요약문_연구목표', '요약문_연구내용', 'label']]
test = test[['과제명', '요약문_연구목표', '요약문_연구내용']]

# Replace missing summaries with the literal string 'NAN' so that string
# concatenation below does not yield NaN.  A dict-based DataFrame.fillna returns
# a new frame, which avoids the chained `df[col].fillna(..., inplace=True)`
# pattern that is a silent no-op under pandas copy-on-write.
# NOTE(review): '과제명' is not filled, so a missing value there still makes the
# concatenated 'data' NaN -- confirm that column never has missing values.
missing_text_fill = {'요약문_연구내용': 'NAN', '요약문_연구목표': 'NAN'}
train = train.fillna(missing_text_fill)
test = test.fillna(missing_text_fill)

# Concatenate the three text fields into a single 'data' column.
train['data'] = train['과제명'] + train['요약문_연구내용'] + train['요약문_연구목표']
test['data'] = test['과제명'] + test['요약문_연구내용'] + test['요약문_연구목표']
train = train[['data', 'label']]
test = test['data']

# Per-label replication factors, indexed by label (0..45).
# Source of the numbers per the original note:
# Notion / research / DACON / Action item / data analysis.
multiple_weight = [1, 3, 13, 25, 60, 2, 40, 30, 10, 34, 5, 17, 13, 6, 2, 20, 6, 25, 2, 1, 3, 7, 25, 2, 1, 4, 11, 6, 8, 3, 14, 4, 13, 6, 6, 12, 3, 13, 13, 17, 8, 50, 70, 10, 35, 3]

# Fail loudly on labels without a weight instead of silently mis-bucketing
# (a negative label would otherwise wrap around via negative list indexing).
valid_labels = list(range(len(multiple_weight)))
if not train['label'].isin(valid_labels).all():
    raise ValueError(
        "train['label'] contains values outside 0..%d" % (len(multiple_weight) - 1)
    )

# Label 0 is down-sampled: keep only its first 3000 rows in file order.
# (The original comment said 5,000 but the code has always used 3,000.)
label_0_cap = 3000
sample_0 = train[train['label'] == 0].head(label_0_cap).values.tolist()

# Bucket rows by label (file order preserved within each bucket), then repeat
# each bucket as a whole `weight` times.
rows_by_label = {
    label: group.values.tolist() for label, group in train.groupby('label')
}
sample = [
    rows_by_label.get(label, []) * weight
    for label, weight in enumerate(multiple_weight)
]

# Final sample = capped label 0 followed by the replicated labels 1..45.
# sample[0] is intentionally skipped in favour of the capped sample_0.
train_over_sample = list(sample_0)
for rows in sample[1:]:
    train_over_sample.extend(rows)

# Take a real copy before shuffling: random.shuffle works in place, so a plain
# `tris = train_over_sample` alias would leave the "unshuffled" output shuffled.
tris = list(train_over_sample)
random.shuffle(train_over_sample)  # unseeded: order differs from run to run

apeach = pd.DataFrame(tris, columns=['data', 'label'])
ryan = pd.DataFrame(train_over_sample, columns=['data', 'label'])

# utf-8-sig writes a BOM at the start of each file.
ryan.to_csv("balance_dataset_shuffle.csv", encoding="utf-8-sig")
apeach.to_csv("balance_dataset.csv", encoding="utf-8-sig")