To train Keras on a large dataset, it pays to build the training set files ahead of time so training itself runs faster.
Because of the way the HDF5 files are written here (each dataset is created from a complete in-memory array in a single call), all the data for one file has to sit in memory before it can be saved.
To keep that manageable, I write the data out in batches, split across two or more HDF5 files.
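For reference (my sketch, not from the original post), the loader below assumes one sub-directory per label under the source folder, with that label's .jpg images inside it:

data/
├── label_A/
│   ├── 0001.jpg
│   └── 0002.jpg
└── label_B/
    ├── 0001.jpg
    └── ...

Each sub-directory's position in the listing becomes the integer label (counter) for its images.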
1. Read the images under each label and assign the label
def load_dataset(path_name, data_path):
    images = []
    labels = []
    train_images = []
    valid_images = []
    train_labels = []
    valid_labels = []
    counter = 0
    allpath = os.listdir(path_name)
    nb_classes = len(allpath)
    print("label_num: ", nb_classes)
    for child_dir in allpath:
        child_path = os.path.join(path_name, child_dir)
        for dir_image in os.listdir(child_path):
            if dir_image.endswith('.jpg'):
                img = cv2.imread(os.path.join(child_path, dir_image))
                image = misc.imresize(img, (IMAGE_SIZE, IMAGE_SIZE), interp='bilinear')
                #resized_img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE))
                images.append(image)
                labels.append(counter)
        # ... continued in step 3 below
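Note that scipy.misc.imresize is deprecated (and removed entirely in SciPy 1.3+); the commented-out cv2.resize line is a drop-in replacement if your SciPy no longer ships it.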
2. Split this label's data into a training set (train images), a validation set (val images), training labels (train labels), and validation labels (val labels)
def split_dataset(images, labels):
    train_images, valid_images, train_labels, valid_labels = train_test_split(
        images, labels, test_size=0.2, random_state=random.randint(0, 100))
    #print(train_images.shape[0], 'train samples')
    #print(valid_images.shape[0], 'valid samples')
    return train_images, valid_images, train_labels, valid_labels
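As a quick sanity check (my example, not from the original post), splitting ten dummy images with test_size=0.2 should give an 8/2 split:

# Hypothetical smoke test for split_dataset, using fake data
import numpy as np
dummy_images = np.zeros((10, 128, 128, 3), dtype=np.uint8)
dummy_labels = list(range(10))
t_x, v_x, t_y, v_y = split_dataset(dummy_images, dummy_labels)
print(len(t_x), len(v_x))   # expected output: 8 2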
3. Append the split data to the overall training set, validation set, training labels, and validation labels.
Then clear the per-label image and label lists to save memory: if you read the images and labels for many labels at once and only split afterwards, you use more than twice the memory of this incremental approach.
# Still inside the per-label loop: convert, split, and accumulate this label's data
images = np.array(images)
t_images, v_images, t_labels, v_labels = split_dataset(images, labels)
for i in range(len(t_images)):
    train_images.append(t_images[i])
    train_labels.append(t_labels[i])
for j in range(len(v_images)):
    valid_images.append(v_images[j])
    valid_labels.append(v_labels[j])
if counter % 50 == 49:
    print(counter + 1, "is read to the memory!")
# Clear the per-label lists to free memory before the next label
images = []
labels = []
counter = counter + 1
print("train_images num: ", len(train_images), "valid_images num: ", len(valid_images))
4. Keep checking the counter until we reach the label where the file should be cut, then start writing.
Before writing, shuffle the image sets together with their labels so the model trains better.
# Once enough labels have accumulated (or we hit the last label), flush to disk.
# The chunk thresholds (4316 / 5000) are hard-coded; adjust them to your dataset.
if (counter % 4316 == 4315) or (counter == nb_classes - 1):
    print("start write images and labels data...")
    num = counter // 5000
    dirs = data_path + "/" + "h5_" + str(num - 1)
    if not os.path.exists(dirs):
        os.makedirs(dirs)
    # Write the accumulated splits (as in the complete code below)
    data2h5(dirs, train_images, valid_images, train_labels, valid_labels)
Shuffle images and labels in matching order and write them to HDF5:
def data2h5(dirs_path, train_images, valid_images, train_labels, valid_labels):
    TRAIN_HDF5 = dirs_path + '/' + "train.hdf5"
    VAL_HDF5 = dirs_path + '/' + "val.hdf5"

    # Shuffle images and labels with the same RNG state so the pairs stay aligned
    state1 = np.random.get_state()
    np.random.shuffle(train_images)
    np.random.set_state(state1)
    np.random.shuffle(train_labels)

    state2 = np.random.get_state()
    np.random.shuffle(valid_images)
    np.random.set_state(state2)
    np.random.shuffle(valid_labels)

    datasets = [
        ("train", train_images, train_labels, TRAIN_HDF5),
        ("val", valid_images, valid_labels, VAL_HDF5)]

    for (dType, images, labels, outputPath) in datasets:
        # create the HDF5 file and write the arrays in one shot
        f = h5py.File(outputPath, "w")
        f.create_dataset("x_" + dType, data=images)
        f.create_dataset("y_" + dType, data=labels)
        #f.create_dataset("x_"+dType, data=images, compression="gzip", compression_opts=9)
        #f.create_dataset("y_"+dType, data=labels, compression="gzip", compression_opts=9)
        f.close()
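The get_state/set_state pairing is what keeps each image aligned with its label: restoring the saved RNG state before shuffling the labels replays the identical permutation. A minimal self-contained demonstration (mine, not from the original post):

# Demonstration of the paired-shuffle trick used in data2h5
import numpy as np
x = np.arange(5)
y = x * 10                    # y[i] belongs to x[i]
state = np.random.get_state()
np.random.shuffle(x)
np.random.set_state(state)    # restore the RNG state ...
np.random.shuffle(y)          # ... so y receives the same permutation
assert list(y) == [v * 10 for v in x]   # pairs are still aligned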
5. Check that every file can be read back
def read_dataset(dirs):
    files = os.listdir(dirs)
    print(files)
    for file in files:
        path = dirs + '/' + file
        dataset = h5py.File(path, "r")
        file = file.split('.')
        set_x_orig = dataset["x_" + file[0]].shape[0]
        set_y_orig = dataset["y_" + file[0]].shape[0]
        print(set_x_orig)
        print(set_y_orig)
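Worth noting (my observation, not from the original post): h5py datasets can be sliced straight from disk, which is what makes the batch generator in the next step memory-friendly, since only the requested slice is loaded. The file path below is illustrative:

# Sketch: read a single batch from an HDF5 file without loading the whole dataset
import h5py
with h5py.File("data_hdf5_half/h5_0/train.hdf5", "r") as db:   # hypothetical path
    batch = db["x_train"][0:32]   # only these 32 images are read into memory
    print(batch.shape)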
6. During training, feed the data in with a generator
# Method of a dataset class; requires os, numpy as np, h5py, IMAGE_SIZE, and
# from keras.preprocessing.image import ImageDataGenerator
def generator(self, datagen, mode):
    passes = np.inf
    aug = ImageDataGenerator(
        featurewise_center=False,
        samplewise_center=False,
        featurewise_std_normalization=False,
        samplewise_std_normalization=False,
        zca_whitening=False,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        horizontal_flip=True,
        vertical_flip=False)
    epochs = 0
    # loop over the data indefinitely by default
    while epochs < passes:
        # walk every HDF5 directory
        file_dir = os.listdir(self.data_path)
        for file in file_dir:
            #print(file)
            file_path = os.path.join(self.data_path, file)
            TRAIN_HDF5 = file_path + "/train.hdf5"
            VAL_HDF5 = file_path + "/val.hdf5"
            #TEST_HDF5 = file_path + "/test.hdf5"
            db_t = h5py.File(TRAIN_HDF5, "r")
            numImages_t = db_t['y_train'].shape[0]
            db_v = h5py.File(VAL_HDF5, "r")
            numImages_v = db_v['y_val'].shape[0]
            if mode == "train":
                for i in np.arange(0, numImages_t, self.BS):
                    images = db_t['x_train'][i: i + self.BS]
                    labels = db_t['y_train'][i: i + self.BS]
                    if K.image_data_format() == 'channels_first':
                        images = images.reshape(images.shape[0], 3, IMAGE_SIZE, IMAGE_SIZE)
                    else:
                        images = images.reshape(images.shape[0], IMAGE_SIZE, IMAGE_SIZE, 3)
                    images = images.astype('float32')
                    images = images / 255
                    if datagen:
                        (images, labels) = next(aug.flow(images, labels, batch_size=self.BS))
                    # one-hot encode the labels
                    if self.binarize:
                        labels = np_utils.to_categorical(labels, self.classes)
                    yield ({'input_1': images}, {'softmax': labels})
            elif mode == "val":
                for i in np.arange(0, numImages_v, self.BS):
                    images = db_v['x_val'][i: i + self.BS]
                    labels = db_v['y_val'][i: i + self.BS]
                    if K.image_data_format() == 'channels_first':
                        images = images.reshape(images.shape[0], 3, IMAGE_SIZE, IMAGE_SIZE)
                    else:
                        images = images.reshape(images.shape[0], IMAGE_SIZE, IMAGE_SIZE, 3)
                    images = images.astype('float32')
                    images = images / 255
                    if datagen:
                        (images, labels) = next(aug.flow(images, labels, batch_size=self.BS))
                    # one-hot encode the labels
                    if self.binarize:
                        labels = np_utils.to_categorical(labels, self.classes)
                    yield ({'input_1': images}, {'softmax': labels})
        epochs += 1
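As a usage sketch (all names here are my assumptions, not from the post): generator is a method of some dataset wrapper class exposing data_path, BS, binarize, and classes, and the compiled model has an input layer named input_1 and an output layer named softmax. Training could then be wired up roughly like this:

# Hedged sketch: `dataset`, `model`, and the image counts are assumed to exist
BS = 32
train_gen = dataset.generator(datagen=True, mode="train")
val_gen = dataset.generator(datagen=False, mode="val")
model.fit_generator(
    train_gen,
    steps_per_epoch=num_train_images // BS,    # total training images across all HDF5 files
    validation_data=val_gen,
    validation_steps=num_valid_images // BS,   # total validation images
    epochs=50)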
7. And with that, the job is done.
The complete code:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 12 20:46:12 2018

@author: william_yue
"""
import os
import numpy as np
import cv2
import random
from scipy import misc
import h5py
from sklearn.model_selection import train_test_split
from keras import backend as K
K.clear_session()
from keras.utils import np_utils

IMAGE_SIZE = 128

# Split the dataset for cross-validation style training
def split_dataset(images, labels):
    # sklearn's train_test_split puts 20% of the data aside for validation
    # and keeps 80% for training
    train_images, valid_images, train_labels, valid_labels = train_test_split(
        images, labels, test_size=0.2, random_state=random.randint(0, 100))
    return train_images, valid_images, train_labels, valid_labels

def data2h5(dirs_path, train_images, valid_images, train_labels, valid_labels):
    #def data2h5(dirs_path, train_images, valid_images, test_images, train_labels, valid_labels, test_labels):
    TRAIN_HDF5 = dirs_path + '/' + "train.hdf5"
    VAL_HDF5 = dirs_path + '/' + "val.hdf5"

    # Shuffle images and labels in the same order for the train and valid sets
    state1 = np.random.get_state()
    np.random.shuffle(train_images)
    np.random.set_state(state1)
    np.random.shuffle(train_labels)

    state2 = np.random.get_state()
    np.random.shuffle(valid_images)
    np.random.set_state(state2)
    np.random.shuffle(valid_labels)

    datasets = [
        ("train", train_images, train_labels, TRAIN_HDF5),
        ("val", valid_images, valid_labels, VAL_HDF5)]

    for (dType, images, labels, outputPath) in datasets:
        # initialize the HDF5 file and write the arrays
        f = h5py.File(outputPath, "w")
        f.create_dataset("x_" + dType, data=images)
        f.create_dataset("y_" + dType, data=labels)
        #f.create_dataset("x_"+dType, data=images, compression="gzip", compression_opts=9)
        #f.create_dataset("y_"+dType, data=labels, compression="gzip", compression_opts=9)
        f.close()

def read_dataset(dirs):
    files = os.listdir(dirs)
    print(files)
    for file in files:
        path = dirs + '/' + file
        file_read = os.listdir(path)
        for i in file_read:
            path_read = os.path.join(path, i)
            dataset = h5py.File(path_read, "r")
            i = i.split('.')
            set_x_orig = dataset["x_" + i[0]].shape[0]
            set_y_orig = dataset["y_" + i[0]].shape[0]
            print(set_x_orig)
            print(set_y_orig)

# Loop over every image under each label directory
def load_dataset(path_name, data_path):
    images = []
    labels = []
    train_images = []
    valid_images = []
    train_labels = []
    valid_labels = []
    counter = 0
    allpath = os.listdir(path_name)
    nb_classes = len(allpath)
    print("label_num: ", nb_classes)
    for child_dir in allpath:
        child_path = os.path.join(path_name, child_dir)
        for dir_image in os.listdir(child_path):
            if dir_image.endswith('.jpg'):
                img = cv2.imread(os.path.join(child_path, dir_image))
                image = misc.imresize(img, (IMAGE_SIZE, IMAGE_SIZE), interp='bilinear')
                #resized_img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE))
                images.append(image)
                labels.append(counter)
        images = np.array(images)
        t_images, v_images, t_labels, v_labels = split_dataset(images, labels)
        for i in range(len(t_images)):
            train_images.append(t_images[i])
            train_labels.append(t_labels[i])
        for j in range(len(v_images)):
            valid_images.append(v_images[j])
            valid_labels.append(v_labels[j])
        if counter % 50 == 49:
            print(counter + 1, "is read to the memory!")
        images = []
        labels = []
        if (counter % 4316 == 4315) or (counter == nb_classes - 1):
            print("train_images num: ", len(train_images), "valid_images num: ", len(valid_images))
            print("start write images and labels data...")
            num = counter // 5000
            dirs = data_path + "/" + "h5_" + str(num - 1)
            if not os.path.exists(dirs):
                os.makedirs(dirs)
            data2h5(dirs, train_images, valid_images, train_labels, valid_labels)
            #read_dataset(dirs)
            print("File HDF5_%d" % num, "is done!")
            train_images = []
            valid_images = []
            train_labels = []
            valid_labels = []
        counter = counter + 1
    print("All HDF5 files are done!")
    read_dataset(data_path)

# Read the training data folder names and return them as a list
def read_name_list(path_name):
    name_list = []
    for child_dir in os.listdir(path_name):
        name_list.append(child_dir)
    return name_list

if __name__ == '__main__':
    path = "data"
    data_path = "data_hdf5_half"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    load_dataset(path, data_path)
That covers the full workflow for reading multiple HDF5 files during Keras training; I hope it serves as a useful reference.
Original post: https://blog.csdn.net/u010847579/article/details/89562256