1) Set the number of classes and the size of the images (including the '3rd' dimension, even though these are greyscale)
2) Read in the files using the get_example_data utility
3) Normalize the data by 80 dBZ
4) Verify the shapes of the training, validation, and testing datasets (an optional sanity check is sketched after the code below)
5) Transform the "single number" classifications into Keras-friendly one-hot arrays
#Based on examples from the Keras documentation
import numpy as np
np.random.seed(42)
from tensorflow import keras
from tensorflow.keras import layers
import pickle
from svrimg.utils.get_images import get_example_data
num_classes = 6
input_shape = (136, 136, 1)
(x_train, y_train) = get_example_data('training', data_dir="../data/pkls/")
(x_val, y_val) = get_example_data('validation', data_dir="../data/pkls/")
(x_test, y_test) = get_example_data('testing', data_dir="../data/pkls/")
#Normalize by 80 dBZ
x_train = x_train.astype("float32") / 80
x_test = x_test.astype("float32") / 80
x_val = x_val.astype("float32") / 80
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_val.shape[0], "validate samples")
print(x_test.shape[0], "test samples")
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
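Step 4 above only prints the shapes; as a minimal, optional sanity check (not part of the svrimg example itself), you can also assert that the arrays match input_shape and that the labels are now one-hot encoded:
#Optional sanity check (not part of the original example):
#confirm the arrays match the expected 136 x 136 x 1 shape and the
#labels were one-hot encoded into num_classes columns.
assert x_train.shape[1:] == input_shape, "unexpected training image shape"
assert x_val.shape[1:] == input_shape and x_test.shape[1:] == input_shape
assert y_train.shape[1] == num_classes, "labels are not one-hot encoded"
print("value range after normalization:", x_train.min(), x_train.max())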
model = keras.Sequential(
[
keras.Input(shape=input_shape),
layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
layers.SpatialDropout2D(0.3),
layers.MaxPooling2D(pool_size=(3, 3)),
layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
layers.SpatialDropout2D(0.3),
layers.MaxPooling2D(pool_size=(3, 3)),
layers.Conv2D(128, kernel_size=(3, 3), activation="relu"),
layers.SpatialDropout2D(0.3),
layers.MaxPooling2D(pool_size=(3, 3)),
layers.Flatten(),
layers.Dense(128, activation="relu"),
layers.Dropout(0.6),
layers.Dense(num_classes, activation="softmax"),
]
)
keras.utils.plot_model(model, show_shapes=True)
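Note that plot_model needs the optional pydot and Graphviz dependencies; if they are not installed, model.summary() prints the same layer-by-layer output shapes as plain text.
#Text-only alternative to plot_model (useful if pydot/Graphviz are missing)
model.summary()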
We can show what these images look like by plotting one example from the test set.
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 8, 8
from svrimg.utils.map_helper import radar_colormap, draw_box_plot
from matplotlib.colors import BoundaryNorm
cmap = radar_colormap()
classes = np.array(list(range(0, 85, 5)))
norm = BoundaryNorm(classes, ncolors=cmap.N)
sample = x_test[52]
ax = plt.subplot(1,1,1)
draw_box_plot(ax, sample.squeeze()*80)
We should avoid shifting the images left or right, because the storm report location is right in the middle of each image. Instead, rotate the images and zoom in and out slightly. It is also important to ask yourself whether the augmentation makes physical sense for these data.
We can visualize the augmentation with nine randomly generated examples.
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from numpy import expand_dims
plt.rcParams['figure.figsize'] = 20, 20
samples = expand_dims(sample, 0)
datagen = ImageDataGenerator(rotation_range=55, zoom_range=[0.9,1.0], fill_mode="reflect")
aug_imgs = datagen.flow(samples, batch_size=1)
for i in range(9):
    ax = plt.subplot(3,3,i+1)
    batch = next(aug_imgs)
    draw_box_plot(ax, batch[0].squeeze()*80)
Create an image generator for the training data and pass it, along with the validation data, into model.fit(). Wait for the model to finish 100 epochs and see how it did!
Note: Modify the "workers" argument depending on how many CPU cores you have. This was tested on a 20-core machine.
epochs = 100
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(datagen.flow(x_train, y_train, batch_size=32),
                    epochs=epochs, validation_data=(x_val, y_val), workers=8)
Plot the training and validation accuracy over time. Divergence of these two curves generally suggests overfitting, which can be addressed by image augmentation, dropout, and getting more data.
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 10, 6
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val.'], loc='upper left')
plt.show()
Now evaluate the model on the test data. If the test accuracy is similar to the validation accuracy, the model may be generalizing well (which is good).
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])
You can see that the high accuracy is mostly due to the model doing well on Cellular, QLCS, and Tropical cases. The model actually has fantastic POD (sensitivity/recall) overall, but higher FAR (an aspect of precision) for Cellular and Tropical cases. At least for the test set, the model detects almost all QLCSs and has very few "QLCS false alarms". This could be my bias towards identifying QLCSs, as I have been looking at images of QLCSs in my free time for longer than is healthy at this point.
76 out of 78 cellular cases, 144 out of 152 QLCS cases, and 31 out of 33 Tropical cases are properly identified!
11 Noise, 1 Missing, 3 Other, and 3 QLCS cases are identified as Cellular (76 out of 94 (81%) Cellular Predictions were correct)
1 Noise, 2 Tropical, and 1 Cellular cases are identified as QLCS (144 out of 148 (97%) QLCS Predictions were correct)
1 Noise, 7 Other, 4 QLCS, and 1 Cellular cases are identified as Tropical (31 out of 44 (70%) Tropical Predictions were correct)
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
y_test_ = np.argmax(y_test, axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test_, y_pred))
print('Classification Report')
target_names = ['Cellular', 'QLCS', 'Tropical', 'Other', 'Missing', 'Noise']
print(classification_report(y_test_, y_pred, target_names=target_names))
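As a rough sketch of how POD and FAR relate to the confusion matrix above (rows are the true classes and columns are the predictions, following scikit-learn's convention), they can be computed per class directly from the matrix; POD is simply recall, and FAR is one minus precision.
#Sketch: per-class POD (recall) and FAR (1 - precision) from the confusion matrix.
#Classes with no samples or no predictions will produce a NaN value.
cm = confusion_matrix(y_test_, y_pred)
hits = cm.diagonal()                 #correct predictions for each class
pod = hits / cm.sum(axis=1)          #hits / (hits + misses)
far = 1 - hits / cm.sum(axis=0)      #false alarms / total predictions
for name, p, f in zip(target_names, pod, far):
    print("{:>8s}  POD: {:.2f}  FAR: {:.2f}".format(name, p, f))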
from svrimg.utils.get_tables import get_svrgis_table, get_pred_tables
actual = get_pred_tables(data_dir="../data/csvs/", example=True, remove_first_row=True)
svrgis = get_svrgis_table(data_dir="../data/csvs/")
actual = actual.join(svrgis)
actual.head()
from svrimg.utils.get_images import get_img_list
plt.rcParams['figure.figsize'] = 25, 25
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
plt.rcParams['axes.labelsize'] = 20
#Testing data are from 2014 onward. It is "cheating" to look at earlier data.
sample = actual[actual.yr>=2014].sample(9)
#Load the images and transform them to be "CNN-friendly"
imgs = get_img_list(sample.index.values, "../data/tor/")
imgs = expand_dims(imgs, 3)
imgs = imgs / 80 #normalize
#Identify the column with the highest probability
pred = np.argmax(model.predict(imgs), axis=1)
truth = sample['Class Code'].values
lookup = {0:'Cellular', 1:'QLCS', 2:'Tropical', 3:'Other', 4:'Noise', 5:'Missing'}
for i, (img, p) in enumerate(zip(imgs, pred)):
    ax = plt.subplot(3, 3, i+1)
    ax = draw_box_plot(ax, img.squeeze()*80, cbar_shrink=0.8)
    ax.set_title("Prediction: {}\nActual: {}".format(lookup[p], lookup[truth[i]]), fontsize=25)
model.save("../data/models/morph_model_v01.h5")
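The saved model can be reloaded later (for example, in a new notebook) with keras.models.load_model; a minimal sketch, reusing the test set from above:
#Reload the saved model and confirm it evaluates the test set the same way
reloaded = keras.models.load_model("../data/models/morph_model_v01.h5")
print("Reloaded test loss and accuracy:", reloaded.evaluate(x_test, y_test, verbose=0))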