From 67bae60e3afbf438e69da4e188bbbde7efaf202a Mon Sep 17 00:00:00 2001
From: sosokker
Date: Thu, 9 May 2024 23:07:16 +0700
Subject: [PATCH] Use ActionModel class to wrap around function that generate frame

---
 .../src/analytic/action/action_model.py | 208 +++++++++---------
 StreamServer/src/routers/video.py        |   6 +-
 2 files changed, 112 insertions(+), 102 deletions(-)

diff --git a/StreamServer/src/analytic/action/action_model.py b/StreamServer/src/analytic/action/action_model.py
index d7f9858..0e8945a 100644
--- a/StreamServer/src/analytic/action/action_model.py
+++ b/StreamServer/src/analytic/action/action_model.py
@@ -49,107 +49,115 @@ def kpt2bbox(kpt, ex=20):
                      kpt[:, 0].max() + ex, kpt[:, 1].max() + ex))
 
 
-def generate_action_model_frame(source):
-    CAM_SOURCE = source
+class ActionModel:
+    def __init__(self) -> None:
+        self.ACTION_LIST = []
 
-    # Model initialization
-    detect_model = TinyYOLOv3_onecls(INP_DETS, device=DEVICE, config_file=CONFIG_FILE,
-                                     weight_file=YOLO_WEIGHT_FILE)
-    pose_model = SPPE_FastPose(POSE_BACKBONE, INP_POSE[0], INP_POSE[1], device=DEVICE, path=SPPE_WEIGHT_FILE)
-    action_model = TSSTG(weight_file=TSSTG_WEIGHT_FILE)  # action model
+        # Model initialization
+        self.detect_model = TinyYOLOv3_onecls(INP_DETS, device=DEVICE, config_file=CONFIG_FILE,
+                                              weight_file=YOLO_WEIGHT_FILE)
+        self.pose_model = SPPE_FastPose(POSE_BACKBONE, INP_POSE[0], INP_POSE[1], device=DEVICE, path=SPPE_WEIGHT_FILE)
+        self.action_model = TSSTG(weight_file=TSSTG_WEIGHT_FILE)  # action model
 
-    # Tracker.
-    max_age = 30
-    tracker = Tracker(max_age=max_age, n_init=3)
-
-    cam = CamLoader(int(CAM_SOURCE) if CAM_SOURCE.isdigit() else CAM_SOURCE,
-                    preprocess=preproc).start()
-
-    fps_time = 0
-    f = 0
-    while cam.grabbed():
-        f += 1
-        frame = cam.getitem()
-        image = frame.copy()
-
-        # Detect humans bbox in the frame with detector model.
-        detected = detect_model.detect(frame, need_resize=False, expand_bb=10)
-
-        # Predict each tracks bbox of current frame from previous frames information with Kalman filter.
-        tracker.predict()
-        # Merge two source of predicted bbox together.
-        for track in tracker.tracks:
-            det = torch.tensor([track.to_tlbr().tolist() + [0.5, 1.0, 0.0]], dtype=torch.float32)
-            detected = torch.cat([detected, det], dim=0) if detected is not None else det
-
-        detections = []  # List of Detections object for tracking.
-        if detected is not None:
-            #detected = non_max_suppression(detected[None, :], 0.45, 0.2)[0]
-            # Predict skeleton pose of each bboxs.
-            poses = pose_model.predict(frame, detected[:, 0:4], detected[:, 4])
-
-            # Create Detections object.
-            detections = [Detection(kpt2bbox(ps['keypoints'].numpy()),
-                                    np.concatenate((ps['keypoints'].numpy(),
-                                                    ps['kp_score'].numpy()), axis=1),
-                                    ps['kp_score'].mean().numpy()) for ps in poses]
-
-            # VISUALIZE.
-            if SHOW_DETECTED:
-                for bb in detected[:, 0:5]:
-                    frame = cv2.rectangle(frame, (bb[0], bb[1]), (bb[2], bb[3]), (0, 0, 255), 1)
-
-        # Update tracks by matching each track information of current and previous frame or
-        # create a new track if no matched.
-        tracker.update(detections)
-
-        # Predict Actions of each track.
-        for i, track in enumerate(tracker.tracks):
-            if not track.is_confirmed():
-                continue
-
-            track_id = track.track_id
-            bbox = track.to_tlbr().astype(int)
-            center = track.get_center().astype(int)
-
-            action = 'pending'
-            clr = (0, 255, 0)
-            # Use 30 frames time-steps to prediction.
-            if len(track.keypoints_list) == 30:
-                pts = np.array(track.keypoints_list, dtype=np.float32)
-                out = action_model.predict(pts, frame.shape[:2])
-                action_name = action_model.class_names[out[0].argmax()]
-                action = '{}: {:.2f}%'.format(action_name, out[0].max() * 100)
-                if action_name == 'Fall Down':
-                    clr = (255, 0, 0)
-                elif action_name == 'Lying Down':
-                    clr = (255, 200, 0)
-
-            # VISUALIZE.
-            if track.time_since_update == 0:
-                if SHOW_SKELETON:
-                    frame = draw_single(frame, track.keypoints_list[-1])
-                frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 1)
-                frame = cv2.putText(frame, str(track_id), (center[0], center[1]), cv2.FONT_HERSHEY_COMPLEX,
-                                    0.4, (255, 0, 0), 2)
-                frame = cv2.putText(frame, action, (bbox[0] + 5, bbox[1] + 15), cv2.FONT_HERSHEY_COMPLEX,
-                                    0.4, clr, 1)
-
-        # Show Frame.
-        frame = cv2.resize(frame, (0, 0), fx=2., fy=2.)
-        frame = cv2.putText(frame, '%d, FPS: %f' % (f, 1.0 / (time.time() - fps_time)),
-                            (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
-        frame = frame[:, :, ::-1]
-        fps_time = time.time()
-
-        # return frame for video streaming
-        ret, buffer = cv2.imencode('.jpg', frame)
-        if not ret:
-            # If encoding fails, raise an error to stop the streaming
-            raise HTTPException(status_code=500, detail="Frame encoding failed")
-        yield (b'--frame\r\n'
-               b'Content-Type: image/jpeg\r\n\r\n' + buffer.tobytes() + b'\r\n')
+        # Tracker.
+        self.max_age = 30
+        self.tracker = Tracker(max_age=self.max_age, n_init=3)
 
 
-def output_action_detection():
-    pass
\ No newline at end of file
+    def generate_action_model_frame(self, source):
+        CAM_SOURCE = source
+        detect_model = self.detect_model
+        pose_model = self.pose_model
+        action_model = self.action_model
+        max_age = self.max_age
+        tracker = self.tracker
+        cam = CamLoader(int(CAM_SOURCE) if CAM_SOURCE.isdigit() else CAM_SOURCE,
+                        preprocess=preproc).start()
+
+        fps_time = 0
+        f = 0
+        while cam.grabbed():
+            f += 1
+            frame = cam.getitem()
+            image = frame.copy()
+
+            # Detect humans bbox in the frame with detector model.
+            detected = detect_model.detect(frame, need_resize=False, expand_bb=10)
+
+            # Predict each tracks bbox of current frame from previous frames information with Kalman filter.
+            tracker.predict()
+            # Merge two source of predicted bbox together.
+            for track in tracker.tracks:
+                det = torch.tensor([track.to_tlbr().tolist() + [0.5, 1.0, 0.0]], dtype=torch.float32)
+                detected = torch.cat([detected, det], dim=0) if detected is not None else det
+
+            detections = []  # List of Detections object for tracking.
+            if detected is not None:
+                #detected = non_max_suppression(detected[None, :], 0.45, 0.2)[0]
+                # Predict skeleton pose of each bboxs.
+                poses = pose_model.predict(frame, detected[:, 0:4], detected[:, 4])
+
+                # Create Detections object.
+                detections = [Detection(kpt2bbox(ps['keypoints'].numpy()),
+                                        np.concatenate((ps['keypoints'].numpy(),
+                                                        ps['kp_score'].numpy()), axis=1),
+                                        ps['kp_score'].mean().numpy()) for ps in poses]
+
+                # VISUALIZE.
+                if SHOW_DETECTED:
+                    for bb in detected[:, 0:5]:
+                        frame = cv2.rectangle(frame, (bb[0], bb[1]), (bb[2], bb[3]), (0, 0, 255), 1)
+
+            # Update tracks by matching each track information of current and previous frame or
+            # create a new track if no matched.
+            tracker.update(detections)
+
+            # Predict Actions of each track.
+            for i, track in enumerate(tracker.tracks):
+                if not track.is_confirmed():
+                    continue
+
+                track_id = track.track_id
+                bbox = track.to_tlbr().astype(int)
+                center = track.get_center().astype(int)
+
+                action = 'pending'
+                clr = (0, 255, 0)
+                # Use 30 frames time-steps to prediction.
+                if len(track.keypoints_list) == 30:
+                    pts = np.array(track.keypoints_list, dtype=np.float32)
+                    out = action_model.predict(pts, frame.shape[:2])
+                    action_name = action_model.class_names[out[0].argmax()]
+                    action = '{}: {:.2f}%'.format(action_name, out[0].max() * 100)
+                    if action_name == 'Fall Down':
+                        clr = (255, 0, 0)
+                    elif action_name == 'Lying Down':
+                        clr = (255, 200, 0)
+
+                # Add action to action list.
+                self.ACTION_LIST.append(action)
+
+                # VISUALIZE.
+                if track.time_since_update == 0:
+                    if SHOW_SKELETON:
+                        frame = draw_single(frame, track.keypoints_list[-1])
+                    frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 1)
+                    frame = cv2.putText(frame, str(track_id), (center[0], center[1]), cv2.FONT_HERSHEY_COMPLEX,
+                                        0.4, (255, 0, 0), 2)
+                    frame = cv2.putText(frame, action, (bbox[0] + 5, bbox[1] + 15), cv2.FONT_HERSHEY_COMPLEX,
+                                        0.4, clr, 1)
+
+            # Show Frame.
+            frame = cv2.resize(frame, (0, 0), fx=2., fy=2.)
+            frame = cv2.putText(frame, '%d, FPS: %f' % (f, 1.0 / (time.time() - fps_time)),
+                                (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
+            frame = frame[:, :, ::-1]
+            fps_time = time.time()
+
+            # return frame for video streaming
+            ret, buffer = cv2.imencode('.jpg', frame)
+            if not ret:
+                # If encoding fails, raise an error to stop the streaming
+                raise HTTPException(status_code=500, detail="Frame encoding failed")
+            yield (b'--frame\r\n'
+                   b'Content-Type: image/jpeg\r\n\r\n' + buffer.tobytes() + b'\r\n')
diff --git a/StreamServer/src/routers/video.py b/StreamServer/src/routers/video.py
index 8ccfbd8..2f27df3 100644
--- a/StreamServer/src/routers/video.py
+++ b/StreamServer/src/routers/video.py
@@ -16,7 +16,7 @@ from config import TEMP_VIDEO_FILE, VIDEO_BUCKET
 from scheme import Camera
 from utils import save_to_config, read_cameras_from_config
 
-from analytic.action.action_model import generate_action_model_frame
+from analytic.action.action_model import ActionModel
 
 
 jobstores = {
@@ -24,6 +24,8 @@ jobstores = {
 }
 scheduler = AsyncIOScheduler(jobstores=jobstores, timezone='Asia/Bangkok')
 
+action_model = ActionModel()
+
 @asynccontextmanager
 async def lifespan(application: FastAPI):
     scheduler.start()
@@ -127,7 +129,7 @@ async def stream_action_video(camera_id: int) -> StreamingResponse:
     if not cap.isOpened():
         raise HTTPException(status_code=404, detail="Camera is closed or not available")
 
-    return StreamingResponse(generate_action_model_frame(camera.link), media_type="multipart/x-mixed-replace; boundary=frame")
+    return StreamingResponse(action_model.generate_action_model_frame(camera.link), media_type="multipart/x-mixed-replace; boundary=frame")
 
 
 @router.delete("/remove/{camera_id}", response_model=dict)
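
Below is a minimal usage sketch (not part of the patch) of how the wrapped class is meant to be driven, mirroring the video.py hunk above. The standalone FastAPI app, the /action/preview route, and the hard-coded "0" camera source are illustrative assumptions only; the real router resolves the source from its camera config via read_cameras_from_config.

# Usage sketch only -- app, route path, and camera source are hypothetical.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

from analytic.action.action_model import ActionModel

app = FastAPI()
action_model = ActionModel()  # detector, pose, and action weights load once here

@app.get("/action/preview")
def preview_action_stream():
    # generate_action_model_frame() is a generator yielding multipart JPEG
    # chunks, so it can be passed straight to StreamingResponse.
    source = "0"  # digit strings are cast to int by the method and open a local webcam
    return StreamingResponse(
        action_model.generate_action_model_frame(source),
        media_type="multipart/x-mixed-replace; boundary=frame",
    )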