Finished example_4, improved example_3, cleanup

lars76 · lars76 · commit 45b0e68d6912 · 2019-01-13T20:04:17.000+01:00
diff --git a/README.md b/README.md
@@ -11,12 +11,11 @@ Before getting started, we have to download a dataset and generate a csv file co
 3. tar xf images.tar.gz
 4. tar xf annotations.tar.gz
 5. mv annotations/xmls/* images/
-6. Optionally for data augmentations: pip3 install imgaug
-7. python3 generate_dataset.py
+6. python3 generate_dataset.py
 
 # Single-object detection
 
-## Example 1: Find the dogs/cats
+## Example 1: Finding dogs/cats
 
 ### Architecture
 
@@ -35,7 +34,7 @@ We proceed in the same way to build the object detector:
 3. Add one/multiple/no convolution block (or `_inverted_res_block` for MobileNetv2)
 4. Add a convolution layer for the coordinates
 
-The code in this repository uses MobileNetv2 [1], because it is faster than other models and the performance can be adapted. For example, if alpha = 0.35 with 96x96 is not good enough, one can just increase both values (see [2] for a comparison). If you use another architecture, change `preprocess_input`.
+The code in this repository uses MobileNetv2, because it is faster than other models and the performance can be adapted. For example, if alpha = 0.35 with 96x96 is not good enough, one can just increase both values (see [here](https://github.com/keras-team/keras-applications/blob/master/keras_applications/mobilenet_v2.py) for a comparison). If you use another architecture, change `preprocess_input`.
 
 1. `python3 example_1/train.py`
 2. Adjust the WEIGHTS_FILE in `example_1/test.py` (given by the last script)
@@ -49,7 +48,7 @@ In the following images red is the predicted box, green is the ground truth:
 
 ![Image 2](https://i.imgur.com/ll9PNOF.jpg)
 
-## Example 2: Find the dogs/cats and distinguish classes
+## Example 2: Finding dogs/cats and distinguishing classes
 
 This time we have to run the scripts `example_2/train.py` and `example_2/test.py`.
 
@@ -73,12 +72,22 @@ In this example, we use a skip-net architecture similar to U-Net. For an in-dept
 
 ![Dog](https://lars76.github.io/assets/images/dog2.gif)
 
+## Example 4: YOLO-like detection
+
+### Architecture
+
+This example is based on the three YOLO papers. For an in-depth explanation see [this blog post](https://lars76.github.io/neural-networks/object-detection/obj-detection-from-scratch/).
+
+### Result
+
+![Multiple dogs](https://lars76.github.io/assets/images/multiple_dogs.jpg)
+
 # Guidelines
 
 ## Improve accuracy (IoU)
 
-- enable augmentations: set `AUGMENTATION=True` in generate_dataset.py and install *imgaug*.
-- better augmentations: increase `AUGMENTATION_PER_IMAGE` and try out different transformations.
+- enable augmentations: see `example_4`; the same code can be added to the other examples
+- better augmentations: try out different values (flips, rotation, etc.)
 - for MobileNetv1/2: increase `ALPHA` and `IMAGE_SIZE` in train_model.py
 - other architectures: increase `IMAGE_SIZE`
 - add more layers
@@ -98,9 +107,3 @@ In this example, we use a skip-net architecture similar to U-Net. For an in-dept
 - If the new dataset is small and not similar to ImageNet, freeze some layers.
 - If the new dataset is large, freeze no layers.
 - read http://cs231n.github.io/transfer-learning/
-
-# References
-
-[1] M. Sandler, A. Howard, M. Zhu, A. Zhmoginov, L.-C. Chen. *MobileNetV2: Inverted Residuals and Linear Bottlenecks*.
-
-[2] https://github.com/keras-team/keras-applications/blob/master/keras_applications/mobilenet_v2.py
diff --git a/example_3/test.py b/example_3/test.py
@@ -2,7 +2,7 @@
 import cv2
 import glob
 
-WEIGHTS_FILE = "model-0.34.h5"
+WEIGHTS_FILE = "model-0.91.h5"
 IMAGES = "images/*jpg"
 THRESHOLD = 0.5
 EPSILON = 0.02
@@ -13,28 +13,24 @@ def main():
 
     for filename in glob.glob(IMAGES):
         unscaled = cv2.imread(filename)
-        image = cv2.resize(unscaled, (IMAGE_WIDTH, IMAGE_HEIGHT))
+        image = cv2.resize(unscaled, (IMAGE_SIZE, IMAGE_SIZE))
         feat_scaled = preprocess_input(np.array(image, dtype=np.float32))
 
-        region = model.predict(x=np.array([feat_scaled]))[0]
+        region = np.squeeze(model.predict(feat_scaled[np.newaxis,:]))
 
-        output = np.zeros(unscaled.shape[:2], dtype=np.uint8)
-        for i in range(region.shape[1]):
-            for j in range(region.shape[0]):
-                if region[i][j] > THRESHOLD:
-                    x = int(CELL_WIDTH * j * unscaled.shape[1] / IMAGE_WIDTH)
-                    y = int(CELL_HEIGHT * i * unscaled.shape[0] / IMAGE_HEIGHT)
-                    x2 = int(CELL_WIDTH * (j + 1) * unscaled.shape[1] / IMAGE_WIDTH)
-                    y2 = int(CELL_HEIGHT * (i + 1) * unscaled.shape[0] / IMAGE_HEIGHT)
-                    #cv2.rectangle(unscaled, (x, y), (x2, y2), (0, 255, 0), 1)
+        output = np.zeros(region.shape, dtype=np.uint8)
+        output[region > 0.5] = 1
 
-                    output[y:y2,x:x2] = 1
-
-        _, contours, _ = cv2.findContours(output, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+        contours, _ = cv2.findContours(output, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
         for cnt in contours:
             approx = cv2.approxPolyDP(cnt, EPSILON * cv2.arcLength(cnt, True), True)
             x, y, w, h = cv2.boundingRect(approx)
-            cv2.rectangle(unscaled, (x, y), (x + w, y + h), (0, 255, 0), 1)
+
+            x0 = np.rint(x * unscaled.shape[1] / output.shape[1]).astype(int)
+            x1 = np.rint((x + w) * unscaled.shape[1] / output.shape[1]).astype(int)
+            y0 = np.rint(y * unscaled.shape[0] / output.shape[0]).astype(int)
+            y1 = np.rint((y + h) * unscaled.shape[0] / output.shape[0]).astype(int)
+            cv2.rectangle(unscaled, (x0, y0), (x1, y1), (0, 255, 0), 1)
 
         cv2.imshow("image", unscaled)
         cv2.waitKey(0)
diff --git a/example_3/train.py b/example_3/train.py
@@ -13,18 +13,15 @@
 from tensorflow.keras.losses import binary_crossentropy
 from tensorflow.keras.backend import epsilon
 
-
 # 0.35, 0.5, 0.75, 1.0
 ALPHA = 1.0
 
-IMAGE_HEIGHT = 224
-IMAGE_WIDTH = 224
-
-HEIGHT_CELLS = 28
-WIDTH_CELLS = 28
+GRID_SIZE = 28
+IMAGE_SIZE = 224
 
-CELL_WIDTH = IMAGE_WIDTH / WIDTH_CELLS
-CELL_HEIGHT = IMAGE_HEIGHT / HEIGHT_CELLS
+# first train with frozen weights, then fine tune
+TRAINABLE = False
+WEIGHTS = "model-0.89.h5"
 
 EPOCHS = 200
 BATCH_SIZE = 8
@@ -42,7 +39,7 @@ def __init__(self, csv_file):
         self.paths = []
 
         with open(csv_file, "r") as file:
-            self.mask = np.zeros((sum(1 for line in file), HEIGHT_CELLS, WIDTH_CELLS))
+            self.mask = np.zeros((sum(1 for line in file), GRID_SIZE, GRID_SIZE))
             file.seek(0)
 
             reader = csv.reader(file, delimiter=",")
@@ -53,18 +50,13 @@ def __init__(self, csv_file):
 
                 path, image_height, image_width, x0, y0, x1, y1, _, _ = row
 
-                x0 *= IMAGE_WIDTH / image_width
-                y0 *= IMAGE_HEIGHT / image_height
-                x1 *= IMAGE_WIDTH / image_width
-                y1 *= IMAGE_HEIGHT / image_height 
+                cell_start_x = np.rint(((GRID_SIZE - 1) / image_width) * x0).astype(int)
+                cell_stop_x = np.rint(((GRID_SIZE - 1) / image_width) * x1).astype(int)
 
-                cell_start_x = max(math.ceil(x0 / CELL_WIDTH) - 1, 0)
-                cell_stop_x = min(math.ceil(x1 / CELL_WIDTH), WIDTH_CELLS) - 1
+                cell_start_y = np.rint(((GRID_SIZE - 1) / image_height) * y0).astype(int)
+                cell_stop_y = np.rint(((GRID_SIZE - 1) / image_height) * y1).astype(int)
 
-                cell_start_y = max(math.ceil(y0 / CELL_HEIGHT) - 1, 0)
-                cell_stop_y = min(math.ceil(y1 / CELL_HEIGHT), HEIGHT_CELLS) - 1
-
-                self.mask[index, cell_start_y:cell_stop_y+1, cell_start_x:cell_stop_x+1] = 1
+                self.mask[index, cell_start_y : cell_stop_y, cell_start_x : cell_stop_x] = 1
 
                 self.paths.append(path)
 
@@ -75,16 +67,16 @@ def __getitem__(self, idx):
         batch_paths = self.paths[idx * BATCH_SIZE:(idx + 1) * BATCH_SIZE]
         batch_masks = self.mask[idx * BATCH_SIZE:(idx + 1) * BATCH_SIZE]
 
-        batch_images = np.zeros((len(batch_paths), IMAGE_HEIGHT, IMAGE_WIDTH, 3), dtype=np.float32)
+        batch_images = np.zeros((len(batch_paths), IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.float32)
         for i, f in enumerate(batch_paths):
             img = Image.open(f)
-            img = img.resize((IMAGE_WIDTH, IMAGE_HEIGHT))
+            img = img.resize((IMAGE_SIZE, IMAGE_SIZE))
             img = img.convert('RGB')
 
             batch_images[i] = preprocess_input(np.array(img, dtype=np.float32))
             img.close()
 
-        return batch_images, batch_masks
+        return batch_images, batch_masks[:,:,:,np.newaxis]
 
 class Validation(Callback):
     def __init__(self, generator):
@@ -110,7 +102,7 @@ def on_epoch_end(self, epoch, logs):
         print(" - val_dice: {}".format(dice))
 
 def create_model(trainable=True):
-    model = MobileNetV2(input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 3), include_top=False, alpha=ALPHA, weights="imagenet")
+    model = MobileNetV2(input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3), include_top=False, alpha=ALPHA, weights="imagenet")
 
     for layer in model.layers:
         layer.trainable = trainable
@@ -136,23 +128,25 @@ def create_model(trainable=True):
         x = Activation("relu")(x)
 
     x = Conv2D(1, kernel_size=1, activation="sigmoid")(x)
-    x = Reshape((HEIGHT_CELLS, WIDTH_CELLS))(x)
 
     return Model(inputs=model.input, outputs=x)
 
 def loss(y_true, y_pred):
     def dice_coefficient(y_true, y_pred):
-        numerator = 2 * tf.reduce_sum(y_true * y_pred)
-        denominator = tf.reduce_sum(y_true + y_pred)
+        numerator = 2 * tf.reduce_sum(y_true * y_pred, axis=-1)
+        denominator = tf.reduce_sum(y_true + y_pred, axis=-1)
 
         return numerator / (denominator + epsilon())
 
     return binary_crossentropy(y_true, y_pred) - tf.log(dice_coefficient(y_true, y_pred) + epsilon())
 
 def main():
-    model = create_model()
+    model = create_model(trainable=TRAINABLE)
     model.summary()
 
+    if TRAINABLE:
+        model.load_weights(WEIGHTS)
+
     train_datagen = DataGenerator(TRAIN_CSV)
     validation_datagen = Validation(generator=DataGenerator(VALIDATION_CSV))
 
diff --git a/example_4/test.py b/example_4/test.py
@@ -3,42 +3,42 @@
 import glob
 import numpy as np
 
-WEIGHTS_FILE = "model-0.37.h5"
+WEIGHTS_FILE = "model-0.51.h5"
 IMAGES = "images/*jpg"
 
 IOU_THRESHOLD = 0.5
 SCORE_THRESHOLD = 0.5
-MAX_OUTPUT_SIZE = 300
+MAX_OUTPUT_SIZE = 49
 
 def main():
     model = create_model()
     model.load_weights(WEIGHTS_FILE)
 
     for filename in glob.glob(IMAGES):
         unscaled = cv2.imread(filename)
-        image = cv2.resize(unscaled, (IMAGE_SIZE, IMAGE_SIZE))
-        feat_scaled = preprocess_input(np.array(image, dtype=np.float32))
+        img = cv2.resize(unscaled, (IMAGE_SIZE, IMAGE_SIZE))
 
-        pred = model.predict(x=np.array([feat_scaled]))[0]
-        height, width, y, x, score = pred[..., 0].flatten(), pred[..., 1].flatten(), pred[..., 2].flatten(), pred[..., 3].flatten(), pred[..., 4].flatten()
+        feat_scaled = preprocess_input(np.array(img, dtype=np.float32))
+
+        pred = np.squeeze(model.predict(feat_scaled[np.newaxis,:]))
+        height, width, y_f, x_f, score = [a.flatten() for a in np.split(pred, pred.shape[-1], axis=-1)]
 
         coords = np.arange(pred.shape[0] * pred.shape[1])
-        boxes = np.stack([coords // pred.shape[0] + y + 1, coords % pred.shape[1] + x + 1, height, width, score], axis=-1)
+        y = (y_f + coords // pred.shape[0]) / (pred.shape[0] - 1)
+        x = (x_f + coords % pred.shape[1]) / (pred.shape[1] - 1)
+
+        boxes = np.stack([y, x, height, width, score], axis=-1)
         boxes = boxes[np.where(boxes[...,-1] >= SCORE_THRESHOLD)]
 
         selected_indices = tf.image.non_max_suppression(boxes[...,:-1], boxes[...,-1], MAX_OUTPUT_SIZE, IOU_THRESHOLD)
         selected_indices = tf.Session().run(selected_indices)
 
-        for k in boxes[selected_indices]:
-            h = k[2] * unscaled.shape[0]
-            w = k[3] * unscaled.shape[1]
-
-            y0 = k[0] * unscaled.shape[0] / pred.shape[0] - h / 2
-            x0 = k[1] * unscaled.shape[1] / pred.shape[1] - w / 2
-            y1 = y0 + h
-            x1 = x0 + w
+        for y_c, x_c, h, w, _ in boxes[selected_indices]:
+            x0 = unscaled.shape[1] * (x_c - w / 2)
+            y0 = unscaled.shape[0] * (y_c - h / 2)
+            x1 = x0 + unscaled.shape[1] * w
+            y1 = y0 + unscaled.shape[0] * h
 
-            #cv2.rectangle(unscaled, (int(k[1] * unscaled.shape[0] / pred.shape[0]), int(k[0] * unscaled.shape[0] / pred.shape[0])), (int(10 + k[1] * unscaled.shape[0] / pred.shape[0]), int(10 + k[0] * unscaled.shape[0] / pred.shape[0])), (0, 0, 255), 1)
             cv2.rectangle(unscaled, (int(x0), int(y0)), (int(x1), int(y1)), (0, 255, 0), 1)
 
         cv2.imshow("image", unscaled)
@@ -47,4 +47,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/example_4/train.py b/example_4/train.py
diff --git a/generate_dataset.py b/generate_dataset.py