Add facial landmarks estimation and refactor code

mmphego · mmphego · commit 03ba54e54785 · 2020-07-13T04:31:29.000+02:00
Signed-off-by: Mpho Mphego <mpho112@gmail.com>
diff --git a/main.py b/main.py
@@ -11,8 +11,8 @@
     bash -c "source /opt/intel/openvino/bin/setupvars.sh && \
         python main.py \
             --face-model models/face-detection-adas-binary-0001 \
+            --facial-landmarks-model models/landmarks-regression-retail-0009 \
             --head-pose-model models/head-pose-estimation-adas-0001 \
-            --facial-landmarks-model models/face-detection-adas-binary-0001 \
             --gaze-model models/gaze-estimation-adas-0002 \
             --input resources/demo.mp4";
 """
@@ -117,13 +117,19 @@ def main(args):
     mouse_controller = MouseController(
         precision=args.mouse_precision, speed=args.mouse_speed
     )
+    video_feed = InputFeeder(input_file=args.input)
+
     face_detection = Face_Detection(
-        args.face_model, device=args.device, threshold=args.prob_threshold
+        model_name=args.face_model,
+        source_width=video_feed.source_width,
+        source_height=video_feed.source_height,
+        device=args.device,
+        threshold=args.prob_threshold,
     )
+    facial_landmarks = Facial_Landmarks(args.facial_landmarks_model, device=args.device)
     head_pose_estimation = Head_Pose_Estimation(
         args.head_pose_model, device=args.device
     )
-    facial_landmarks = Facial_Landmarks(args.facial_landmarks_model, device=args.device)
     gaze_estimation = Gaze_Estimation(args.gaze_model, device=args.device)
 
     model_load_time = (
@@ -134,19 +140,38 @@ def main(args):
     ) / 1000
     logger.info(f"Total time taken to load all the models: {model_load_time:.2f} secs.")
 
-    video_feed = InputFeeder(input_file=args.input)
-
-    # Add source width and height for face detection.
-    face_detection._init_image_w = video_feed.source_width
-    face_detection._init_image_h = video_feed.source_height
-
     for frame in video_feed.next_frame():
-        predict_end_time, pred_result = face_detection.predict(frame,draw=True)
-        text = f"Inference time: {predict_end_time:.2f}ms"
-        face_detection.add_text(text, frame, (15, face_detection._init_image_h - 50))
+        predict_end_time, _, face_bboxes = face_detection.predict(frame, draw=True)
+        text = f"Face Detection Inference time: {predict_end_time:.3f} s"
+        face_detection.add_text(text, frame, (15, video_feed.source_height - 80))
+
+        if face_bboxes:
+            for face_bbox in face_bboxes:
+                # Useful resource: https://www.pyimagesearch.com/2018/09/24/opencv-face-recognition/
+
+                # Each entry of face_bboxes returned by the face detection inference
+                # holds the box corners in the order `xmin, ymin, xmax, ymax`.
+                # The face can therefore be cropped from the frame with:
+                # frame[face_bbox[1]:face_bbox[3], face_bbox[0]:face_bbox[2]]
+
+                # extract the face ROI
+                (x, y, w, h) = face_bbox
+                face = frame[y:h, x:w]
+                (face_height, face_width) = face.shape[:2]
+                #  video_feed.show(frame[y:h, x:w], "face")
+
+                # ensure the face width and height are sufficiently large
+                if face_height < 20 or face_width < 20:
+                    continue
+
+                predict_end_time, _, landmarks_bboxes = facial_landmarks.predict(face)
+                text = f"Facial Landmarks Est. Inference time: {predict_end_time:.3f} s"
+                facial_landmarks.add_text(
+                    text, frame, (15, video_feed.source_height - 60)
+                )
 
         if args.debug:
-            video_feed.show(frame)
+            video_feed.show(video_feed.resize(frame))
 
     video_feed.close()
 
diff --git a/src/input_feeder.py b/src/input_feeder.py
@@ -78,11 +78,13 @@ def progress_bar(self):
             self._progress_bar = tqdm(total=int(self.video_len - self.fps + 1))
         return self._progress_bar
 
-    def resize(self,frame):
-        return cv2.resize(frame, (self.source_width - 200, self.source_height - 200))
+    def resize(self, frame, height=None, width=None):
+        if (height and width) is None:
+            width, height = (self.source_width - 200, self.source_height - 200)
+        return cv2.resize(frame, (width, height))
 
     def show(self, frame, frame_name="video"):
-        cv2.imshow(frame_name, self.resize(frame))
+        cv2.imshow(frame_name, frame)
 
     def write_video(self, output_path=".", filename="output_video.mp4"):
         out_video = cv2.VideoWriter(
diff --git a/src/model.py b/src/model.py
@@ -25,7 +25,15 @@
 class Base(abc.ABC):
     """Model Base Class"""
 
-    def __init__(self, model_name, device="CPU", threshold=0.60, extensions=None):
+    def __init__(
+        self,
+        model_name,
+        source_width=None,
+        source_height=None,
+        device="CPU",
+        threshold=0.60,
+        extensions=None,
+    ):
         self.model_weights = f"{model_name}.bin"
         self.model_structure = f"{model_name}.xml"
         assert (
@@ -45,8 +53,8 @@ def __init__(self, model_name, device="CPU", threshold=0.60, extensions=None):
         self.input_shape = self.model.inputs[self.input_name].shape
         self.output_name = next(iter(self.model.outputs))
         self.output_shape = self.model.outputs[self.output_name].shape
-        self._init_image_w = None
-        self._init_image_h = None
+        self._init_image_w = source_width
+        self._init_image_h = source_height
         self.exec_network = None
         self.load_model()
 
@@ -91,15 +99,16 @@ def predict(self, image, request_id=0, draw=False):
             request_id=request_id, inputs={self.input_name: p_image}
         )
         status = self.exec_network.requests[request_id].wait(-1)
+        bbox = None
         if status == 0:
             predict_start_time = time.time()
             pred_result = self.exec_network.requests[request_id].outputs[
                 self.output_name
             ]
-            predict_end_time = (time.time() - predict_start_time) * 1000
+            predict_end_time = float(time.time() - predict_start_time) * 1000
             if draw:
-                self.preprocess_output(pred_result, image, show_bbox=draw)
-            return (predict_end_time, pred_result)
+                bbox, _ = self.preprocess_output(pred_result, image, show_bbox=draw)
+            return (predict_end_time, pred_result, bbox)
 
     @abc.abstractmethod
     def preprocess_output(self, inference_results, image, show_bbox=False):
@@ -128,8 +137,18 @@ def preprocess_input(self, image):
 class Face_Detection(Base):
     """Class for the Face Detection Model."""
 
-    def __init__(self, model_name, device="CPU", threshold=0.60, extensions=None):
-        super().__init__(model_name, device="CPU", threshold=0.60, extensions=None)
+    def __init__(
+        self,
+        model_name,
+        source_width=None,
+        source_height=None,
+        device="CPU",
+        threshold=0.60,
+        extensions=None,
+    ):
+        super().__init__(
+            model_name, source_width, source_height, device, threshold, extensions,
+        )
 
     def preprocess_output(self, inference_results, image, show_bbox=False):
         """Draw bounding boxes onto the frame."""
@@ -199,8 +218,18 @@ def draw_output(
 class Head_Pose_Estimation(Base):
     """Class for the Head Pose Estimation Model."""
 
-    def __init__(self, model_name, device="CPU", threshold=0.60, extensions=None):
-        super().__init__(model_name, device="CPU", threshold=0.60, extensions=None)
+    def __init__(
+        self,
+        model_name,
+        source_width=None,
+        source_height=None,
+        device="CPU",
+        threshold=0.60,
+        extensions=None,
+    ):
+        super().__init__(
+            model_name, source_width, source_height, device, threshold, extensions,
+        )
 
     def preprocess_output(self, inference_results, image):
         pass
@@ -212,8 +241,18 @@ def draw_output(coords, image):
 class Facial_Landmarks(Base):
     """Class for the Facial Landmarks Detection Model."""
 
-    def __init__(self, model_name, device="CPU", threshold=0.60, extensions=None):
-        super().__init__(model_name, device="CPU", threshold=0.60, extensions=None)
+    def __init__(
+        self,
+        model_name,
+        source_width=None,
+        source_height=None,
+        device="CPU",
+        threshold=0.60,
+        extensions=None,
+    ):
+        super().__init__(
+            model_name, source_width, source_height, device, threshold, extensions,
+        )
 
     def preprocess_output(self, inference_results, image):
         pass
@@ -225,8 +264,18 @@ def draw_output(coords, image):
 class Gaze_Estimation(Base):
     """Class for the Gaze Estimation Detection Model."""
 
-    def __init__(self, model_name, device="CPU", threshold=0.60, extensions=None):
-        super().__init__(model_name, device="CPU", threshold=0.60, extensions=None)
+    def __init__(
+        self,
+        model_name,
+        source_width=None,
+        source_height=None,
+        device="CPU",
+        threshold=0.60,
+        extensions=None,
+    ):
+        super().__init__(
+            model_name, source_width, source_height, device, threshold, extensions,
+        )
 
     def preprocess_output(self, inference_results, image):
         pass