-
Notifications
You must be signed in to change notification settings - Fork 40
/
Copy pathdata_deal.py
108 lines (83 loc) · 2.52 KB
/
data_deal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import cv2 as cv
import os
import numpy as np
import random
import pickle
import time
# Wall-clock start, used at the end of the script to report the total run time.
start_time = time.time()
data_dir, batch_save_path = './data', './batch_files'
# Create the folder that will hold the batch files (no error if it already exists).
os.makedirs(batch_save_path, exist_ok=True)
# Layout produced by this script:
#   - every image is resized to 100 x 100
#   - training set, 20000 images: 100 batch files with 200 images each
#   - validation set: one single test file holding the remaining images
#     (5000 of them, to be consumed as 50 images x 100 batches, assuming the
#     directory holds 25000 images -- TODO confirm against the dataset)
# List the file names found in the image directory.
all_data_files = os.listdir(os.path.join(data_dir, 'train/'))
# print(all_data_files)
# Shuffle the file order in place so the train/test split is random.
random.shuffle(all_data_files)
all_train_files, all_test_files = all_data_files[:20000], all_data_files[20000:]
# Parallel accumulators: index i of data / label / filenames describes one sample.
train_data, train_label, train_filenames = [], [], []
test_data, test_label, test_filenames = [], [], []
# Training set: load each image, resize it to 100 x 100 and record its label
# (0 = cat, 1 = dog, decided by the substring found in the file name).
for each in all_train_files:
    img_path = os.path.join(data_dir, 'train/', each)
    # Flag 1 is cv.IMREAD_COLOR: always decode to a 3-channel colour image.
    img = cv.imread(img_path, 1)
    if img is None:
        # cv.imread reports an unreadable / non-image file by returning None
        # instead of raising; stop here with the offending path rather than
        # letting cv.resize fail with an opaque assertion error.
        raise Exception('%s could not be read as an image' % (img_path))
    resized_img = cv.resize(img, (100, 100))
    img_data = np.array(resized_img)
    train_data.append(img_data)
    if 'cat' in each:
        train_label.append(0)
    elif 'dog' in each:
        train_label.append(1)
    else:
        # Neither class name appears in the file name: the label is unknown.
        raise Exception('%s is wrong train file'%(each))
    train_filenames.append(each)
# Test set: same processing as the training set, applied to the held-out files
# (which are also read from the 'train/' directory).
for each in all_test_files:
    img_path = os.path.join(data_dir, 'train/', each)
    # Flag 1 is cv.IMREAD_COLOR: always decode to a 3-channel colour image.
    img = cv.imread(img_path, 1)
    if img is None:
        # cv.imread reports an unreadable / non-image file by returning None
        # instead of raising; stop here with the offending path rather than
        # letting cv.resize fail with an opaque assertion error.
        raise Exception('%s could not be read as an image' % (img_path))
    resized_img = cv.resize(img, (100, 100))
    img_data = np.array(resized_img)
    test_data.append(img_data)
    if 'cat' in each:
        test_label.append(0)
    elif 'dog' in each:
        test_label.append(1)
    else:
        # Neither class name appears in the file name: the label is unknown.
        raise Exception('%s is wrong test file'%(each))
    test_filenames.append(each)
print(len(train_data), len(test_data))
# Write the training batch files: 100 files of 200 consecutive samples each.
num_batches = 100
batch_size = 200
for num in range(1, num_batches + 1):
    # Slice bounds are derived from the 1-based batch number instead of being
    # advanced by hand at the end of every iteration.
    start = (num - 1) * batch_size
    end = start + batch_size
    batch_data = train_data[start: end]
    batch_label = train_label[start: end]
    batch_filenames = train_filenames[start: end]
    # The total in the name comes from num_batches; it used to be a stale
    # hard-coded "15" that did not match the number of files written.
    batch_name = 'training batch {} of {}'.format(num, num_batches)
    all_data = {
        'data': batch_data,
        'label': batch_label,
        'filenames': batch_filenames,
        'name': batch_name,
    }
    # One pickle per batch: <batch_save_path>/train_batch_<num>
    with open(os.path.join(batch_save_path, 'train_batch_{}'.format(num)), 'wb') as f:
        pickle.dump(all_data, f)
# Write the single test file that holds every held-out sample.
all_test_data = dict(
    data=test_data,
    label=test_label,
    filenames=test_filenames,
    name='test batch 1 of 1',
)
test_batch_path = os.path.join(batch_save_path, 'test_batch')
with open(test_batch_path, 'wb') as f:
    pickle.dump(all_test_data, f)
# Report the total wall-clock time; the message reads "Done, took {} seconds".
end_time = time.time()
elapsed = end_time - start_time
print('制作结束, 用时{}秒'.format(elapsed))