- 投稿日:2020-07-02T21:31:52+09:00
Light CNNの実装(Python Keras)
はじめに
本記事ではLight CNN(LCNN)という深層学習モデルを実装したのでまとめさせていただきました。
最初にLCNNとその特徴であるMax Feature Mapping (MFM)という技術について説明した後、実装と評価をしていきます。
コードは全てpython、LCNNの実装はTensorflow, Kerasを使って行います。
なお、LCNNを実装するためのコードはGitHubに載せているので参考にしてください。
GitHub URL : https://github.com/ozora-ogino/LCNN
Light CNN
LCNNは2015年に提案され、現在はSTCという機関により研究されている深層学習手法で、画像分類や音声分類等の分野で使用されています。
LCNNは8層の畳み込み層から構成されており、各層における活性化関数でMax Feature Mappingと呼ばれるものを使っているのが大きな特徴となっています。
Max Feature Mapping
MFMについてはこちらに詳しくまとめたので参考にしてみてください。
元の論文も載せておきます。
"A Light CNN for Deep Face Representation with Noisy Labels" (https://arxiv.org/pdf/1511.02683.pdf)
実装
GitHubに載せているものと同じコードになります。
# File: lcnn.py
import tensorflow as tf
from keras.layers import (
    Activation,
    BatchNormalization,
    Conv2D,  # imported from the public keras.layers namespace
    Dense,
    Dropout,
    Flatten,
    Input,
    Lambda,
    MaxPool2D,
)
from keras.models import Model
from keras.initializers import he_normal

# Custom layer
from .layers import Maxout


def MaxOutConv2D(x, dim, kernel_size, strides, padding='same'):
    """Return the stack of a Conv2D layer followed by an MFM (Maxout) layer.

    Arguments:
        x : Input tensor.
        dim (int) : Number of Conv2D filters. The Maxout layer is built
            with ``dim // 2``.
        kernel_size : Forwarded to Conv2D.
        strides : Forwarded to Conv2D.
        padding (str) : Forwarded to Conv2D.

    Returns:
        The output tensor of the Maxout layer.
    """
    conv_out = Conv2D(dim, kernel_size=kernel_size, strides=strides,
                      padding=padding)(x)
    # Integer floor division instead of int(dim / 2): no float round-trip.
    return Maxout(dim // 2)(conv_out)


def MaxOutDense(x, dim):
    """Return the stack of a Dense (FC) layer followed by an MFM (Maxout) layer.

    Arguments:
        x : Input tensor.
        dim (int) : Number of Dense units. The Maxout layer is built
            with ``dim // 2``.

    Returns:
        The output tensor of the Maxout layer.
    """
    dense_out = Dense(dim)(x)
    return Maxout(dim // 2)(dense_out)


def build_lcnn(shape, n_label=2):
    """Build the LCNN model.

    Arguments:
        shape (list) : Input shape for LCNN. (Example : [128, 128, 1])
        n_label (int) : Number of labels that LCNN should predict.

    Returns:
        A keras ``Model`` mapping the input to an ``n_label``-way softmax.
        The model is not compiled here.
    """
    # Named `inputs` so the builtin `input` is not shadowed.
    inputs = Input(shape=shape)

    # Block 1
    x = MaxOutConv2D(inputs, 64, kernel_size=5, strides=1, padding='same')
    x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)

    # Blocks 2-3
    x = MaxOutConv2D(x, 64, kernel_size=1, strides=1, padding='same')
    x = BatchNormalization()(x)
    x = MaxOutConv2D(x, 96, kernel_size=3, strides=1, padding='same')
    x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)
    x = BatchNormalization()(x)

    # Blocks 4-5
    x = MaxOutConv2D(x, 96, kernel_size=1, strides=1, padding='same')
    x = BatchNormalization()(x)
    x = MaxOutConv2D(x, 128, kernel_size=3, strides=1, padding='same')
    x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)

    # Blocks 6-9
    x = MaxOutConv2D(x, 128, kernel_size=1, strides=1, padding='same')
    x = BatchNormalization()(x)
    x = MaxOutConv2D(x, 64, kernel_size=3, strides=1, padding='same')
    x = BatchNormalization()(x)
    x = MaxOutConv2D(x, 64, kernel_size=1, strides=1, padding='same')
    x = BatchNormalization()(x)
    x = MaxOutConv2D(x, 64, kernel_size=3, strides=1, padding='same')
    x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)

    # Classifier head
    x = Flatten()(x)
    x = MaxOutDense(x, 160)
    x = BatchNormalization()(x)
    x = Dropout(0.75)(x)
    outputs = Dense(n_label, activation='softmax')(x)

    return Model(inputs=inputs, outputs=outputs)

# --- Article section: 評価 (Evaluation) ---
実装したLCNNは音声認識コンペで使用されたモデルになりますが、簡単な画像認識でどの程度の性能が出るのかmnistとCIFAR10で試してみました。
モデルチューニング等は一切していませんがmnistでは99%、CIFAR10では75%程度の性能を示すことができました。
mnist
# File: test_mint.py (name as given in the article; presumably test_mnist.py)
# Trains the LCNN on MNIST and prints the test accuracy and loss.
import numpy as np
from keras.callbacks import EarlyStopping
from keras.datasets import mnist
from keras.optimizers import Adam  # was used below without being imported
from keras.utils import to_categorical

# NOTE(review): build_lcnn is defined in lcnn.py, but the article does not
# show how it is imported here -- add the import matching your package layout.

lr = 0.001
epochs = 10
batch_size = 256

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide by 255 and append a trailing channel axis of size 1.
x_train = x_train / 255
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], x_train.shape[2], 1))
y_train = to_categorical(y_train)

input_shape = x_train.shape[1:]
lcnn = build_lcnn(input_shape, n_label=10)
lcnn.compile(optimizer=Adam(learning_rate=lr),
             loss='categorical_crossentropy',
             metrics=['accuracy'])

# Stop once val_loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
history = lcnn.fit(x_train, y_train,
                   epochs=epochs,
                   batch_size=batch_size,
                   validation_split=0.2,
                   callbacks=[es])

# Apply the same preprocessing to the test split before evaluating.
x_test = x_test / 255
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], x_test.shape[2], 1))
y_test = to_categorical(y_test)

loss, acc = lcnn.evaluate(x_test, y_test)
print(f'Accuracy : {acc*100}')  # Result --> Accuracy : 99.90999794006348
print(f'Loss : {loss}')  # Result --> Loss : 0.04250425341885457

# --- Article section: CIFAR10 ---
# File: test_cifar10.py
# Trains the LCNN on CIFAR10 and prints the test accuracy and loss.
import numpy as np
from keras.callbacks import EarlyStopping
from keras.datasets import cifar10
from keras.optimizers import Adam  # was used below without being imported
from keras.utils import to_categorical

# NOTE(review): build_lcnn is defined in lcnn.py, but the article does not
# show how it is imported here -- add the import matching your package layout.

lr = 0.001
epochs = 100
batch_size = 64

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Divide by 255; no reshape is applied, the loaded array shape is used as-is.
x_train = x_train / 255
y_train = to_categorical(y_train)

input_shape = x_train.shape[1:]
lcnn = build_lcnn(input_shape, n_label=10)
lcnn.compile(optimizer=Adam(learning_rate=lr),
             loss='categorical_crossentropy',
             metrics=['accuracy'])

# Stop once val_loss has not improved for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
history = lcnn.fit(x_train, y_train,
                   epochs=epochs,
                   batch_size=batch_size,
                   validation_split=0.2,
                   callbacks=[es])

# Apply the same preprocessing to the test split before evaluating.
x_test = x_test / 255
y_test = to_categorical(y_test)

loss, acc = lcnn.evaluate(x_test, y_test)
print(f'Accuracy : {acc*100}')  # Result --> Accuracy : 75.1200020313263
print(f'Loss : {loss}')  # Result --> Loss : 1.2616282165050507

# --- Article section: まとめ (Summary) ---
LCNNという深層学習モデルを実装したのでまとめさせていただきました。
参考になれば幸いです。
GitHub URL : https://github.com/ozora-ogino/LCNN
Reference
"A Light CNN for Deep Face Representation with Noisy Labels"
"STC Antispoofing Systems for the ASVspoof2019 Challenge"
"Audio replay attack detection with deep learning frameworks"