皮肤癌是皮肤细胞的异常生长产生的癌症,它是最常见的癌症之一,而且可能致命。但是如果及早发现,您的皮肤科医生可以对其进行治疗并彻底消除。
使用深度学习和神经网络,我们将能够对良性和恶性皮肤疾病进行分类,这可能有助于医生在早期阶段诊断出癌症。在本教程中,我们将创建一个皮肤疾病分类器,尝试使用Python中的TensorFlow框架仅从图像中区分良性(痣和脂溢性角化病)和恶性(黑素瘤)皮肤病。
好了,我们来一步一步操作吧。
▊ 安装所需的库:
pip3 install tensorflow tensorflow_hub matplotlib seaborn numpy pandas sklearn imblearn
打开一个新的笔记本(或bfwstudio)并导入必要的模块:
import tensorflow as tf import tensorflow_hub as hub import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns from tensorflow.keras.utils import get_file from sklearn.metrics import roc_curve, auc, confusion_matrix from imblearn.metrics import sensitivity_score, specificity_score import os import glob import zipfile import random # to get consistent results after multiple runs tf.random.set_seed(7) np.random.seed(7) random.seed(7) # 0 for benign, 1 for malignant class_names = ["benign", "malignant"]
def download_and_extract_dataset(): # dataset from https://github.com/udacity/dermatologist-ai # 5.3GB train_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/train.zip" # 824.5MB valid_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/valid.zip" # 5.1GB test_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/test.zip" for i, download_link in enumerate([valid_url, train_url, test_url]): temp_file = f"temp{i}.zip" data_dir = get_file(origin=download_link, fname=os.path.join(os.getcwd(), temp_file)) print("Extracting", download_link) with zipfile.ZipFile(data_dir, "r") as z: z.extractall("data") # remove the temp file os.remove(temp_file) # comment the below line if you already downloaded the dataset
# preparing data # generate CSV metadata file to read img paths and labels from it def generate_csv(folder, labels): folder_name = os.path.basename(folder) # convert comma separated labels into a list label2int = {} if labels: labels = labels.split(",") for label in labels: string_label, integer_label = label.split("=") label2int[string_label] = integer_label labels = list(label2int) # generate CSV file df = pd.DataFrame(columns=["filepath", "label"]) i = 0 for label in labels: print("Reading", os.path.join(folder, label, "*")) for filepath in glob.glob(os.path.join(folder, label, "*")): df.loc[i] = [filepath, label2int[label]] i += 1 output_file = f"{folder_name}.csv" print("Saving", output_file) df.to_csv(output_file) # generate CSV files for all data portions, labeling nevus and seborrheic keratosis # as 0 (benign), and melanoma as 1 (malignant) # you should replace "data" path to your extracted dataset path # don't replace if you used download_and_extract_dataset() function generate_csv("data/train", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1}) generate_csv("data/valid", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1}) generate_csv("data/test", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1})
# loading data train_metadata_filename = "train.csv" valid_metadata_filename = "valid.csv" # load CSV files as DataFrames df_train = pd.read_csv(train_metadata_filename) df_valid = pd.read_csv(valid_metadata_filename) n_training_samples = len(df_train) n_validation_samples = len(df_valid) print("Number of training samples:", n_training_samples) print("Number of validation samples:", n_validation_samples) train_ds = tf.data.Dataset.from_tensor_slices((df_train["filepath"], df_train["label"])) valid_ds = tf.data.Dataset.from_tensor_slices((df_valid["filepath"], df_valid["label"]))
Number of training samples: 2000 Number of validation samples: 150让我们加载图像:
# preprocess data def decode_img(img): # convert the compressed string to a 3D uint8 tensor img = tf.image.decode_jpeg(img, channels=3) # Use `convert_image_dtype` to convert to floats in the [0,1] range. img = tf.image.convert_image_dtype(img, tf.float32) # resize the image to the desired size. return tf.image.resize(img, [299, 299]) def process_path(filepath, label): # load the raw data from the file as a string img = tf.io.read_file(filepath) img = decode_img(img) return img, label valid_ds = valid_ds.map(process_path) train_ds = train_ds.map(process_path) # test_ds = test_ds for image, label in train_ds.take(1): print("Image shape:", image.shape) print("Label:", label.numpy())
上面的代码使用map()方法process_path()在两组样本中的每个样本上执行函数,它将基本上加载图像,解码图像格式,将图像像素转换为该范围[0, 1]并将其调整为(299, 299, 3),然后拍摄一张图像并打印形状:
Image shape: (299, 299, 3) Label: 0一切都按预期进行,现在让我们准备进行训练的数据集:
# training parameters batch_size = 64 optimizer = "rmsprop" def prepare_for_training(ds, cache=True, batch_size=64, shuffle_buffer_size=1000): if cache: if isinstance(cache, str): ds = ds.cache(cache) else: ds = ds.cache() # shuffle the dataset ds = ds.shuffle(buffer_size=shuffle_buffer_size) # Repeat forever ds = ds.repeat() # split to batches ds = ds.batch(batch_size) # `prefetch` lets the dataset fetch batches in the background while the model # is training. ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) return ds valid_ds = prepare_for_training(valid_ds, batch_size=batch_size, cache="valid-cached-data") train_ds = prepare_for_training(train_ds, batch_size=batch_size, cache="train-cached-data")
batch = next(iter(valid_ds)) def show_batch(batch): plt.figure(figsize=(12,12)) for n in range(25): ax = plt.subplot(5,5,n+1) plt.imshow(batch[0][n]) plt.title(class_names[batch[1][n].numpy()].title()) plt.axis('off')
# building the model # InceptionV3 model & pre-trained weights module_url = "https://tfhub.dev/google/tf2-preview/inception_v3/feature_vector/4" m = tf.keras.Sequential([ hub.KerasLayer(module_url, output_shape=[2048], trainable=False), tf.keras.layers.Dense(1, activation="sigmoid") ]) m.build([None, 299, 299, 3]) m.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]) m.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= keras_layer (KerasLayer) multiple 21802784 _________________________________________________________________ dense (Dense) multiple 2049 ================================================================= Total params: 21,804,833 Trainable params: 2,049 Non-trainable params: 21,802,784
model_name = f"benign-vs-malignant_{batch_size}_{optimizer}" tensorboard = tf.keras.callbacks.TensorBoard(log_dir=os.path.join("logs", model_name)) # saves model checkpoint whenever we reach better weights modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(model_name + "_{val_loss:.3f}.h5", save_best_only=True, verbose=1) history = m.fit(train_ds, validation_data=valid_ds, steps_per_epoch=n_training_samples // batch_size, validation_steps=n_validation_samples // batch_size, verbose=1, epochs=100, callbacks=[tensorboard, modelcheckpoint])
Train for 31 steps, validate for 2 steps Epoch 1/100 30/31 [============================>.] - ETA: 9s - loss: 0.4609 - accuracy: 0.7760 Epoch 00001: val_loss improved from inf to 0.49703, saving model to benign-vs-malignant_64_rmsprop_0.497.h5 31/31 [==============================] - 282s 9s/step - loss: 0.4646 - accuracy: 0.7722 - val_loss: 0.4970 - val_accuracy: 0.8125 <..SNIPED..> Epoch 27/100 30/31 [============================>.] - ETA: 0s - loss: 0.2982 - accuracy: 0.8708 Epoch 00027: val_loss improved from 0.40253 to 0.38991, saving model to benign-vs-malignant_64_rmsprop_0.390.h5 31/31 [==============================] - 21s 691ms/step - loss: 0.3025 - accuracy: 0.8684 - val_loss: 0.3899 - val_accuracy: 0.8359 <..SNIPED..> Epoch 41/100 30/31 [============================>.] - ETA: 0s - loss: 0.2800 - accuracy: 0.8802 Epoch 00041: val_loss did not improve from 0.38991 31/31 [==============================] - 21s 690ms/step - loss: 0.2829 - accuracy: 0.8790 - val_loss: 0.3948 - val_accuracy: 0.8281 Epoch 42/100 30/31 [============================>.] - ETA: 0s - loss: 0.2680 - accuracy: 0.8859 Epoch 00042: val_loss did not improve from 0.38991 31/31 [==============================] - 21s 693ms/step - loss: 0.2722 - accuracy: 0.8831 - val_loss: 0.4572 - val_accuracy: 0.8047
# evaluation # load testing set test_metadata_filename = "test.csv" df_test = pd.read_csv(test_metadata_filename) n_testing_samples = len(df_test) print("Number of testing samples:", n_testing_samples) test_ds = tf.data.Dataset.from_tensor_slices((df_test["filepath"], df_test["label"])) def prepare_for_testing(ds, cache=True, shuffle_buffer_size=1000): if cache: if isinstance(cache, str): ds = ds.cache(cache) else: ds = ds.cache() ds = ds.shuffle(buffer_size=shuffle_buffer_size) return ds test_ds = test_ds.map(process_path) test_ds = prepare_for_testing(test_ds, cache="test-cached-data")
Number of testing samples: 600
# convert testing set to numpy array to fit in memory (don't do that when testing # set is too large) y_test = np.zeros((n_testing_samples,)) X_test = np.zeros((n_testing_samples, 299, 299, 3)) for i, (img, label) in enumerate(test_ds.take(n_testing_samples)): # print(img.shape, label.shape) X_test[i] = img y_test[i] = label.numpy() print("y_test.shape:", y_test.shape)
# load the weights with the least loss m.load_weights("benign-vs-malignant_64_rmsprop_0.390.h5")
print("Evaluating the model...") loss, accuracy = m.evaluate(X_test, y_test, verbose=0) print("Loss:", loss, " Accuracy:", accuracy)
def get_predictions(threshold=None): """ Returns predictions for binary classification given `threshold` For instance, if threshold is 0.3, then it'll output 1 (malignant) for that sample if the probability of 1 is 30% or more (instead of 50%) """ y_pred = m.predict(X_test) if not threshold: threshold = 0.5 result = np.zeros((n_testing_samples,)) for i in range(n_testing_samples): # test melanoma probability if y_pred[i][0] >= threshold: result[i] = 1 # else, it's 0 (benign) return result threshold = 0.23 # get predictions with 23% threshold # which means if the model is 23% sure or more that is malignant, # it's assigned as malignant, otherwise it's benign y_pred = get_predictions(threshold)
def plot_confusion_matrix(y_test, y_pred): cmn = confusion_matrix(y_test, y_pred) # Normalise cmn = cmn.astype('float') / cmn.sum(axis=1)[:, np.newaxis] # print it print(cmn) fig, ax = plt.subplots(figsize=(10,10)) sns.heatmap(cmn, annot=True, fmt='.2f', xticklabels=[f"pred_{c}" for c in class_names], yticklabels=[f"true_{c}" for c in class_names], cmap="Blues" ) plt.ylabel('Actual') plt.xlabel('Predicted') # plot the resulting confusion matrix plt.show() plot_confusion_matrix(y_test, y_pred)
sensitivity = sensitivity_score(y_test, y_pred) specificity = specificity_score(y_test, y_pred) print("Melanoma Sensitivity:", sensitivity) print("Melanoma Specificity:", specificity)
def plot_roc_auc(y_true, y_pred): """ This function plots the ROC curves and provides the scores. """ # prepare for figure plt.figure() fpr, tpr, _ = roc_curve(y_true, y_pred) # obtain ROC AUC roc_auc = auc(fpr, tpr) # print score print(f"ROC AUC: {roc_auc:.3f}") # plot ROC curve plt.plot(fpr, tpr, color="blue", lw=2, label='ROC curve (area = {f:.2f})'.format(d=1, f=roc_auc)) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC curves') plt.legend(loc="lower right") plt.show() plot_roc_auc(y_test, y_pred)
网友评论0