In this blog post, I will walk through a semantic segmentation problem and review fully convolutional networks. In an image for semantic segmentation, each pixel is labeled with the class of its enclosing object or region; for example, a pixel might belong to a road, a car, a building, or a person. The semantic segmentation problem therefore requires making a classification at every pixel.
I will use a Fully Convolutional Network (FCN) to classify every pixel.
To understand the semantic segmentation problem, let's look at example data prepared by divamgupta. Note: I use this example data rather than a well-known segmentation dataset such as Pascal VOC2012 because the latter requires extra pre-processing.
Download the data¶
First, I download data from:
https://drive.google.com/file/d/0B0d9ZiqAgFkiOHR1NTJhWVJMNEU/view
and save the downloaded dataset1 folder in the current directory.
dir_data = "dataset1/"
dir_seg = dir_data + "annotations_prepped_train/"
dir_img = dir_data + "images_prepped_train/"
Visualize a single segmentation image¶
In this data there are 12 segmentation classes, and the images come from a camera on a driving car.
import cv2, os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
## seaborn has white grid by default so I will get rid of this.
sns.set_style("whitegrid", {'axes.grid' : False})
ldseg = np.array(os.listdir(dir_seg))
## pick the first image file
fnm = ldseg[0]
print(fnm)
## read in the original image and segmentation labels
seg = cv2.imread(dir_seg + fnm ) # (360, 480, 3)
img_is = cv2.imread(dir_img + fnm )
print("seg.shape={}, img_is.shape={}".format(seg.shape,img_is.shape))
## Check the number of labels
mi, ma = np.min(seg), np.max(seg)
n_classes = ma - mi + 1
print("minimum seg = {}, maximum seg = {}, Total number of segmentation classes = {}".format(mi,ma, n_classes))
fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(1,1,1)
ax.imshow(img_is)
ax.set_title("original image")
plt.show()
fig = plt.figure(figsize=(15,10))
for k in range(mi, ma+1):
    ax = fig.add_subplot(3, n_classes//3, k+1)  ## integer division: add_subplot needs ints in Python 3
    ax.imshow((seg == k)*1.0)
    ax.set_title("label = {}".format(k))
plt.show()
Data preprocessing: Resize image¶
To simplify the problem, I will resize all the images to the same shape: (224, 224). Why (224, 224)? This is the image shape used in VGG, and the FCN model in this blog builds on the VGG structure. It is also a convenient shape: five rounds of (2, 2) max-pooling shrink 224 to 224/2^5 = 7, matching VGG16's final 7x7 feature map, which makes the FCN model easier to explain. However, FCN does not require the input shape to be (224, 224).
Let's visualize what the resized images look like. The images look fine.
import random
def give_color_to_seg_img(seg, n_classes):
    '''
    seg : (input_height, input_width, 3)
    '''
    if len(seg.shape) == 3:
        seg = seg[:, :, 0]
    seg_img = np.zeros((seg.shape[0], seg.shape[1], 3)).astype('float')
    colors = sns.color_palette("hls", n_classes)
    for c in range(n_classes):
        segc = (seg == c)
        seg_img[:, :, 0] += (segc * colors[c][0])
        seg_img[:, :, 1] += (segc * colors[c][1])
        seg_img[:, :, 2] += (segc * colors[c][2])
    return seg_img
input_height , input_width = 224 , 224
output_height , output_width = 224 , 224
ldseg = np.array(os.listdir(dir_seg))
for fnm in ldseg[np.random.choice(len(ldseg), 3, replace=False)]:
    fnm = fnm.split(".")[0]
    seg = cv2.imread(dir_seg + fnm + ".png")  # (360, 480, 3)
    img_is = cv2.imread(dir_img + fnm + ".png")
    seg_img = give_color_to_seg_img(seg, n_classes)

    fig = plt.figure(figsize=(20, 40))
    ax = fig.add_subplot(1, 4, 1)
    ax.imshow(seg_img)
    ax = fig.add_subplot(1, 4, 2)
    ax.imshow(img_is/255.0)
    ax.set_title("original image {}".format(img_is.shape[:2]))
    ax = fig.add_subplot(1, 4, 3)
    ax.imshow(cv2.resize(seg_img, (input_width, input_height)))  ## cv2.resize takes (width, height)
    ax = fig.add_subplot(1, 4, 4)
    ax.imshow(cv2.resize(img_is, (output_width, output_height))/255.0)
    ax.set_title("resized to {}".format((output_height, output_width)))
    plt.show()
Resize all the images. We have 367 images in this dataset.
def getImageArr(path, width, height):
    ## read an image, resize it, and scale pixel values to [-1, 1]
    img = cv2.imread(path, 1)
    img = np.float32(cv2.resize(img, (width, height))) / 127.5 - 1
    return img

def getSegmentationArr(path, nClasses, width, height):
    ## read a label image, resize it, and one-hot encode it into (height, width, nClasses)
    seg_labels = np.zeros((height, width, nClasses))
    img = cv2.imread(path, 1)
    img = cv2.resize(img, (width, height))
    img = img[:, :, 0]
    for c in range(nClasses):
        seg_labels[:, :, c] = (img == c).astype(int)
    ##seg_labels = np.reshape(seg_labels, (width*height, nClasses))
    return seg_labels
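As a quick sanity check (my own addition, not part of the original pipeline), the one-hot label map returned by getSegmentationArr should sum to one at every pixel, as long as every raw label value is below n_classes:
## sanity check: each pixel's one-hot vector should sum to exactly 1
fnm0 = os.listdir(dir_seg)[0]
lab = getSegmentationArr(dir_seg + fnm0, n_classes, output_width, output_height)
assert np.all(lab.sum(axis=-1) == 1.0)
print("one-hot label shape:", lab.shape)  ## (224, 224, 12)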
images = os.listdir(dir_img)
images.sort()
segmentations = os.listdir(dir_seg)
segmentations.sort()
X = []
Y = []
for im, seg in zip(images, segmentations):
    X.append(getImageArr(dir_img + im, input_width, input_height))
    Y.append(getSegmentationArr(dir_seg + seg, n_classes, output_width, output_height))

X, Y = np.array(X), np.array(Y)
print(X.shape,Y.shape)
Import Keras and TensorFlow to develop deep learning FCN models¶
## Import usual libraries
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import keras, sys, time, warnings
from keras.models import *
from keras.layers import *
import pandas as pd
warnings.filterwarnings("ignore")
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.95
config.gpu_options.visible_device_list = "2"
set_session(tf.Session(config=config))
print("python {}".format(sys.version))
print("keras version {}".format(keras.__version__)); del keras
print("tensorflow version {}".format(tf.__version__))
From classifier to dense FCN¶
Recent successful deep learning models such as VGG were originally designed for classification tasks. Such a network stacks convolution layers together with down-sampling layers, such as max-pooling, and finally stacks fully connected layers. Appending fully connected layers enables the network to learn using global information, but the spatial arrangement of the input falls away.
Fully convolutional network¶
For the segmentation task, however, spatial information must be preserved to make a pixel-wise classification. FCN achieves this by converting all the fully connected layers of VGG into convolutional layers.
Fully convolutional indicates that the neural network is composed of convolutional layers without any of the fully connected layers usually found at the end of a network. Fully Convolutional Networks for Semantic Segmentation motivates the use of fully convolutional networks by "convolutionalizing" popular CNN architectures; e.g., VGG can also be viewed as an FCN.
... fully connected layers can also be viewed as convolutions with kernels that cover their entire input regions. Doing so casts them into fully convolutional networks that take input of any size and output classification maps. (Section 3.1)
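To make this concrete, here is a small sketch of my own (the shapes follow VGG16, but this snippet is not part of the model built below): a Dense(4096) layer applied to the flattened 7x7x512 feature map has exactly the same number of parameters as a 7x7 convolution with 4096 filters and 'valid' padding; the convolution simply keeps the spatial interpretation.
## Dense on a flattened 7x7x512 map vs. an equivalent 7x7 convolution:
## both have 7*7*512*4096 weights (+ 4096 biases).
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Conv2D
inp = Input(shape=(7, 7, 512))
dense_out = Dense(4096)(Flatten()(inp))                ## shape (None, 4096)
conv_out = Conv2D(4096, (7, 7), padding='valid')(inp)  ## shape (None, 1, 1, 4096)
print(Model(inp, dense_out).count_params())            ## 102764544
print(Model(inp, conv_out).count_params())             ## identical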
The model I use in this blog post is FCN8 from Fully Convolutional Networks for Semantic Segmentation. It decapitates the VGG16 net by discarding the final classifier layer and converts all fully connected layers to convolutions. The paper then appends a 1 x 1 convolution with channel dimension equal to the number of segmentation classes (in our case, 12) to predict scores at each of the coarse output locations, followed by upsampling (deconvolution) layers that bring the low-resolution maps back to the output image size. In our example, the output image size is (output_height, output_width) = (224, 224).
Upsampling¶
An upsampling layer brings a low-resolution image to a higher resolution. There are various upsampling methods; this presentation gives a good overview. For example, one may double the image resolution by duplicating each pixel along both axes. This is the so-called nearest-neighbor approach, implemented in Keras's UpSampling2D. Another method is bilinear upsampling, which linearly interpolates from the nearest four inputs.
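As a toy illustration of my own (plain numpy, not the Keras layer itself), nearest-neighbor upsampling simply repeats each pixel along both axes:
## nearest-neighbor upsampling of a 2x2 "image" to 4x4
x = np.array([[1, 2],
              [3, 4]])
print(np.repeat(np.repeat(x, 2, axis=0), 2, axis=1))
## [[1 1 2 2]
##  [1 1 2 2]
##  [3 3 4 4]
##  [3 3 4 4]]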
These upsampling layers have no weights/parameters, so the model is not flexible. Instead, FCN8 uses an upsampling procedure called backwards convolution (sometimes called deconvolution) with some output stride. This method simply reverses the forward and backward passes of convolution and is implemented in Keras's Conv2DTranspose. This deconvolution upsampling layer is well explained in this blog post: Up-sampling with Transposed Convolution.
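Here is a minimal sketch of my own comparing the two layers on a dummy 7x7 feature map (the sizes are illustrative and separate from the FCN8 model below); both double the resolution, but only Conv2DTranspose has trainable weights:
## UpSampling2D has no parameters; Conv2DTranspose learns its kernels
from keras.models import Model
from keras.layers import Input, UpSampling2D, Conv2DTranspose
inp = Input(shape=(7, 7, 12))
nn = UpSampling2D(size=(2, 2))(inp)  ## -> (None, 14, 14, 12)
tc = Conv2DTranspose(12, kernel_size=(4, 4), strides=(2, 2),
                     padding='same', use_bias=False)(inp)  ## -> (None, 14, 14, 12)
print(Model(inp, nn).count_params())  ## 0
print(Model(inp, tc).count_params())  ## 4*4*12*12 = 2304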
In FCN8, the upsampling layers are combined with skip connections from earlier pooling layers. See details in Fully Convolutional Networks for Semantic Segmentation.
I downloaded the VGG16 weights from fchollet's GitHub. This is a massive .h5 file (57MB).
## location of VGG weights
VGG_Weights_path = "../FacialKeypoint/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5"
def FCN8(nClasses, input_height=224, input_width=224):
    ## input_height and width must be divisible by 32 because maxpooling with filter size = (2,2) is operated 5 times,
    ## which makes the input_height and width 2^5 = 32 times smaller
    assert input_height % 32 == 0
    assert input_width % 32 == 0
    IMAGE_ORDERING = "channels_last"

    img_input = Input(shape=(input_height, input_width, 3))  ## Assume 224,224,3

    ## Block 1
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv1', data_format=IMAGE_ORDERING)(img_input)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv2', data_format=IMAGE_ORDERING)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool', data_format=IMAGE_ORDERING)(x)
    f1 = x

    ## Block 2
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv1', data_format=IMAGE_ORDERING)(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv2', data_format=IMAGE_ORDERING)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool', data_format=IMAGE_ORDERING)(x)
    f2 = x

    ## Block 3
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv1', data_format=IMAGE_ORDERING)(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv2', data_format=IMAGE_ORDERING)(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv3', data_format=IMAGE_ORDERING)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool', data_format=IMAGE_ORDERING)(x)
    pool3 = x  ## (None, 28, 28, 256)

    ## Block 4
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv1', data_format=IMAGE_ORDERING)(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv2', data_format=IMAGE_ORDERING)(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv3', data_format=IMAGE_ORDERING)(x)
    pool4 = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool', data_format=IMAGE_ORDERING)(x)  ## (None, 14, 14, 512)

    ## Block 5
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv1', data_format=IMAGE_ORDERING)(pool4)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv2', data_format=IMAGE_ORDERING)(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv3', data_format=IMAGE_ORDERING)(x)
    pool5 = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool', data_format=IMAGE_ORDERING)(x)  ## (None, 7, 7, 512)

    ## In the original VGG16 classifier, pool5 is followed by fully connected layers;
    ## each corresponds to a convolution, as noted below:
    #x = Flatten(name='flatten')(x)
    #x = Dense(4096, activation='relu', name='fc1')(x)
    # <--> o = (Conv2D(4096, (7, 7), activation='relu', padding='same', data_format=IMAGE_ORDERING))(o)
    # assuming that the input_height = input_width = 224 as in VGG data
    #x = Dense(4096, activation='relu', name='fc2')(x)
    # <--> o = (Conv2D(4096, (1, 1), activation='relu', padding='same', data_format=IMAGE_ORDERING))(o)
    # assuming that the input_height = input_width = 224 as in VGG data
    #x = Dense(1000, activation='softmax', name='predictions')(x)
    # <--> o = (Conv2D(nClasses, (1, 1), kernel_initializer='he_normal', data_format=IMAGE_ORDERING))(o)
    # assuming that the input_height = input_width = 224 as in VGG data

    vgg = Model(img_input, pool5)
    vgg.load_weights(VGG_Weights_path)  ## loading VGG weights for the encoder part of FCN8

    n = 4096
    o = (Conv2D(n, (7, 7), activation='relu', padding='same', name="conv6", data_format=IMAGE_ORDERING))(pool5)
    conv7 = (Conv2D(n, (1, 1), activation='relu', padding='same', name="conv7", data_format=IMAGE_ORDERING))(o)

    ## 4 times upsampling for the conv7 layer
    conv7_4 = Conv2DTranspose(nClasses, kernel_size=(4, 4), strides=(4, 4), use_bias=False, data_format=IMAGE_ORDERING)(conv7)
    ## (None, 28, 28, nClasses)

    ## 2 times upsampling for the pool4 prediction
    pool411 = (Conv2D(nClasses, (1, 1), activation='relu', padding='same', name="pool4_11", data_format=IMAGE_ORDERING))(pool4)
    pool411_2 = (Conv2DTranspose(nClasses, kernel_size=(2, 2), strides=(2, 2), use_bias=False, data_format=IMAGE_ORDERING))(pool411)

    ## prediction from pool3 (already at 28 x 28)
    pool311 = (Conv2D(nClasses, (1, 1), activation='relu', padding='same', name="pool3_11", data_format=IMAGE_ORDERING))(pool3)

    ## fuse the three predictions and upsample 8 times back to (224, 224)
    o = Add(name="add")([pool411_2, pool311, conv7_4])
    o = Conv2DTranspose(nClasses, kernel_size=(8, 8), strides=(8, 8), use_bias=False, data_format=IMAGE_ORDERING)(o)
    o = (Activation('softmax'))(o)

    model = Model(img_input, o)
    return model
model = FCN8(nClasses = n_classes,
input_height = 224,
input_width = 224)
model.summary()
Split between training and testing data¶
from sklearn.utils import shuffle
train_rate = 0.85
index_train = np.random.choice(X.shape[0],int(X.shape[0]*train_rate),replace=False)
index_test = list(set(range(X.shape[0])) - set(index_train))
X, Y = shuffle(X,Y)
X_train, y_train = X[index_train],Y[index_train]
X_test, y_test = X[index_test],Y[index_test]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
Training starts here¶
from keras import optimizers
sgd = optimizers.SGD(lr=1E-2, decay=5e-4, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
optimizer=sgd,
metrics=['accuracy'])
hist1 = model.fit(X_train,y_train,
validation_data=(X_test,y_test),
batch_size=32,epochs=200,verbose=2)
Plot the change in loss over epochs¶
for key in ['loss', 'val_loss']:
    plt.plot(hist1.history[key], label=key)
plt.legend()
plt.show()
Calculate intersection over union for each segmentation class¶
y_pred = model.predict(X_test)
y_predi = np.argmax(y_pred, axis=3)
y_testi = np.argmax(y_test, axis=3)
print(y_testi.shape,y_predi.shape)
def IoU(Yi, y_predi):
    ## per-class Intersection over Union: IoU = TP / (TP + FP + FN)
    IoUs = []
    Nclass = int(np.max(Yi)) + 1
    for c in range(Nclass):
        TP = np.sum((Yi == c) & (y_predi == c))
        FP = np.sum((Yi != c) & (y_predi == c))
        FN = np.sum((Yi == c) & (y_predi != c))
        IoU = TP / float(TP + FP + FN)
        print("class {:02.0f}: #TP={:6.0f}, #FP={:6.0f}, #FN={:5.0f}, IoU={:4.3f}".format(c, TP, FP, FN, IoU))
        IoUs.append(IoU)
    mIoU = np.mean(IoUs)
    print("_________________")
    print("Mean IoU: {:4.3f}".format(mIoU))
IoU(y_testi,y_predi)
Visualize the model performance¶
Looks reasonable!
for i in range(10):
    img_is = (X_test[i] + 1)*(255.0/2)  ## undo the [-1, 1] scaling
    seg = y_predi[i]
    segtest = y_testi[i]

    fig = plt.figure(figsize=(10, 30))
    ax = fig.add_subplot(1, 3, 1)
    ax.imshow(img_is/255.0)
    ax.set_title("original")
    ax = fig.add_subplot(1, 3, 2)
    ax.imshow(give_color_to_seg_img(seg, n_classes))
    ax.set_title("predicted class")
    ax = fig.add_subplot(1, 3, 3)
    ax.imshow(give_color_to_seg_img(segtest, n_classes))
    ax.set_title("true class")
    plt.show()