add vitpose_wholebody #9284

Open · wants to merge 4 commits into base: develop
11 changes: 6 additions & 5 deletions deploy/python/det_keypoint_unite_infer.py
@@ -31,7 +31,8 @@

KEYPOINT_SUPPORT_MODELS = {
    'HigherHRNet': 'keypoint_bottomup',
-   'HRNet': 'keypoint_topdown'
+   'HRNet': 'keypoint_topdown',
+   'VitPose_TopDown_WholeBody': 'keypoint_topdown_wholebody'
}


@@ -177,9 +178,10 @@ def topdown_unite_predict_video(detector,
                    current_keypoints)

                keypoint_res['keypoint'][0][0] = smooth_keypoints.tolist()

+           zero = np.zeros((height, width, 3), dtype=np.uint8)
            im = visualize_pose(
-               frame,
+               zero,
                keypoint_res,
                visual_thresh=FLAGS.keypoint_threshold,
                returnimg=True)
@@ -329,8 +331,7 @@ def main():
enable_mkldnn=FLAGS.enable_mkldnn,
use_dark=FLAGS.use_dark)
    keypoint_arch = topdown_keypoint_detector.pred_config.arch
-   assert KEYPOINT_SUPPORT_MODELS[
-       keypoint_arch] == 'keypoint_topdown', 'Detection-Keypoint unite inference only supports topdown models.'
+   assert KEYPOINT_SUPPORT_MODELS[keypoint_arch] in (
+       'keypoint_topdown', 'keypoint_topdown_wholebody'
+   ), 'Detection-Keypoint unite inference only supports topdown models.'

# predict from video file or camera video stream
if FLAGS.video_file is not None or FLAGS.camera_id != -1:
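Two behaviour changes in this file: the gate in main() now admits both plain top-down and wholebody top-down models, and the video branch draws the skeleton on a black canvas (zero) instead of the original frame, so the rendered output shows the pose only. A minimal, self-contained sketch of the dispatch logic (the arch names are the ones registered above; the helper name is illustrative):

    KEYPOINT_SUPPORT_MODELS = {
        'HigherHRNet': 'keypoint_bottomup',
        'HRNet': 'keypoint_topdown',
        'VitPose_TopDown_WholeBody': 'keypoint_topdown_wholebody',
    }

    def check_unite_support(keypoint_arch):
        # unite (detector + keypoint) inference feeds person crops to the
        # keypoint model, so only top-down style models are usable
        category = KEYPOINT_SUPPORT_MODELS[keypoint_arch]
        assert category in ('keypoint_topdown', 'keypoint_topdown_wholebody'), \
            'Detection-Keypoint unite inference only supports topdown models.'

    check_unite_support('VitPose_TopDown_WholeBody')  # passes
    # check_unite_support('HigherHRNet')  # would raise AssertionError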
2 changes: 1 addition & 1 deletion deploy/python/infer.py
@@ -34,7 +34,7 @@
from benchmark_utils import PaddleInferBenchmark
from picodet_postprocess import PicoDetPostProcess
from preprocess import preprocess, Resize, NormalizeImage, Permute, PadStride, LetterBoxResize, WarpAffine, Pad, decode_image, CULaneResize
-from keypoint_preprocess import EvalAffine, TopDownEvalAffine, expand_crop
+from keypoint_preprocess import EvalAffine, TopDownEvalAffine, TopDownAffineImage, expand_crop
from clrnet_postprocess import CLRNetPostProcess
from visualize import visualize_box_mask, imshow_lanes
from utils import argsparser, Timer, get_current_memory_mb, multiclass_nms, coco_clsid2catid
49 changes: 48 additions & 1 deletion deploy/python/keypoint_infer.py
@@ -42,10 +42,40 @@
# Global dictionary
KEYPOINT_SUPPORT_MODELS = {
    'HigherHRNet': 'keypoint_bottomup',
-   'HRNet': 'keypoint_topdown'
+   'HRNet': 'keypoint_topdown',
+   'VitPose_TopDown_WholeBody': 'keypoint_topdown_wholebody'
}


def _box2cs(image_size, box):
    """Encode a bbox (x, y, w, h) into a (center, scale) pair.

    Args:
        image_size (list): [w, h] of the model input.
        box (list): [x, y, w, h] of the region to encode.

    Returns:
        tuple: A tuple containing center and scale.

        - np.ndarray[float32](2,): Center of the bbox (x, y).
        - np.ndarray[float32](2,): Scale of the bbox w & h.
    """

    x, y, w, h = box[:4]
    input_size = image_size
    aspect_ratio = input_size[0] / input_size[1]
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)

    # pad the box to the input aspect ratio before scaling
    if w > aspect_ratio * h:
        h = w * 1.0 / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio

    # pixel std is 200.0
    scale = np.array([w / 200.0, h / 200.0], dtype=np.float32)
    scale = scale * 1.25

    return center, scale

class KeyPointDetector(Detector):
"""
Args:
@@ -137,6 +167,23 @@ def postprocess(self, inputs, result):
            imshape = inputs['im_shape'][:, ::-1]
            center = np.round(imshape / 2.)
            scale = imshape / 200.
            keypoint_postprocess = HRNetPostProcess(use_dark=self.use_dark)
            kpts, scores = keypoint_postprocess(np_heatmap, center, scale)
            results['keypoint'] = kpts
            results['score'] = scores
            return results
        elif KEYPOINT_SUPPORT_MODELS[
                self.pred_config.arch] == 'keypoint_topdown_wholebody':
            results = {}
            imshape = inputs['im_shape'][:, ::-1]
            center = []
            scale = []
            for i in range(len(inputs['im_shape'])):
                # inputs['image'] is laid out (N, C, H, W); encode each full
                # image as a (center, scale) pair for the heatmap postprocess
                transize = np.shape(inputs["image"])
                tmp_center, tmp_scale = _box2cs(
                    [transize[-1], transize[-2]],
                    [0, 0, inputs['im_shape'][i][1], inputs['im_shape'][i][0]])
                center.append(tmp_center)
                scale.append(tmp_scale)

            keypoint_postprocess = HRNetPostProcess(use_dark=self.use_dark)
            kpts, scores = keypoint_postprocess(np_heatmap, center, scale)
            results['keypoint'] = kpts
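A quick worked example of the _box2cs encoding used by the new wholebody branch. This is a sketch with made-up numbers — a 288x384 (w, h) model input and a 640x480 frame — using the _box2cs function defined above:

    import numpy as np

    # encode the full 640x480 frame against a 288x384 (w, h) input
    center, scale = _box2cs([288, 384], [0, 0, 640, 480])
    print(center)  # [320. 240.]
    print(scale)   # approximately [4.0, 5.33]

The box is first padded from 640x480 to 640x853.3 to match the 0.75 input aspect ratio, then divided by the 200-pixel std and enlarged by 1.25 (640 / 200 * 1.25 = 4.0, 853.3 / 200 * 1.25 ≈ 5.33). HRNetPostProcess then uses these (center, scale) pairs to map heatmap peaks back to original image coordinates.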
77 changes: 77 additions & 0 deletions deploy/python/keypoint_preprocess.py
@@ -18,6 +18,83 @@
import numpy as np


def _box2cs(image_size, box):
    """Encode a bbox (x, y, w, h) into a (center, scale) pair.

    Args:
        image_size (list): [w, h] of the model input.
        box (list): [x, y, w, h] of the region to encode.

    Returns:
        tuple: A tuple containing center and scale.

        - np.ndarray[float32](2,): Center of the bbox (x, y).
        - np.ndarray[float32](2,): Scale of the bbox w & h.
    """

    x, y, w, h = box[:4]
    input_size = image_size
    aspect_ratio = input_size[0] / input_size[1]
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)

    # pad the box to the input aspect ratio before scaling
    if w > aspect_ratio * h:
        h = w * 1.0 / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio

    # pixel std is 200.0
    scale = np.array([w / 200.0, h / 200.0], dtype=np.float32)
    scale = scale * 1.25

    return center, scale

class TopDownAffineImage(object):
    """Apply an affine transform that warps the image to the train size.

    Args:
        trainsize (list): [w, h], the standard size used to train.
        use_udp (bool): whether to use Unbiased Data Processing.
        use_box2cs (bool): whether to derive (center, scale) from the
            full-image box via _box2cs instead of reading them from im_info.

    Returns:
        tuple: the warped image and the (unchanged) im_info dict.
    """

    def __init__(self, trainsize, use_udp=False, use_box2cs=True):
        self.trainsize = trainsize
        self.use_udp = use_udp
        self.use_box2cs = use_box2cs

    def __call__(self, records, im_info):
        if self.use_box2cs:
            center, scale = _box2cs(
                self.trainsize,
                [0, 0, im_info['im_shape'][1], im_info['im_shape'][0]])
        else:
            imshape = im_info['im_shape'][::-1]
            center = im_info['center'] if 'center' in im_info else imshape / 2.
            scale = im_info['scale'] if 'scale' in im_info else imshape

        # records is the image array at deploy time, so the rotation flag
        # (if any) lives in im_info, and there are no joints to warp here
        image = records
        rot = im_info['rotate'] if 'rotate' in im_info else 0
        if self.use_udp:
            trans = get_warp_matrix(
                rot, center * 2.0,
                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
                scale * 200.0)
            image = cv2.warpAffine(
                image,
                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
                flags=cv2.INTER_LINEAR)
        else:
            trans = get_affine_transform(center, scale * 200, rot,
                                         self.trainsize)
            image = cv2.warpAffine(
                image,
                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
                flags=cv2.INTER_LINEAR)
        return image, im_info


class EvalAffine(object):
def __init__(self, size, stride=64):
super(EvalAffine, self).__init__()
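A usage sketch for the new op, assuming the module's existing get_affine_transform helper; the 288x384 train size and the frame are dummies:

    import numpy as np
    from keypoint_preprocess import TopDownAffineImage

    op = TopDownAffineImage(trainsize=[288, 384], use_udp=False)
    image = np.zeros((480, 640, 3), dtype=np.uint8)    # dummy BGR frame
    im_info = {'im_shape': np.array([480., 640.])}     # [h, w]
    warped, im_info = op(image, im_info)
    print(warped.shape)  # (384, 288, 3) -- cv2.warpAffine dsize is (w, h)

With use_box2cs left at its default True, the whole frame is fitted into the train size through the same (center, scale) convention that the postprocess later inverts.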
25 changes: 16 additions & 9 deletions deploy/python/visualize.py
@@ -20,6 +20,7 @@
import numpy as np
import PIL
from PIL import Image, ImageDraw, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

def imagedraw_textsize_c(draw, text):
@@ -234,14 +235,14 @@ def get_color(idx):
color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255)
return color


def visualize_pose(imgfile,
                   results,
                   visual_thresh=0.6,
                   save_name='pose.jpg',
                   save_dir='output',
                   returnimg=False,
-                  ids=None):
+                  ids=None,
+                  draw_box=False):
try:
import matplotlib.pyplot as plt
import matplotlib
@@ -252,30 +253,36 @@ def visualize_pose(imgfile,
raise e
skeletons, scores = results['keypoint']
skeletons = np.array(skeletons)
    # default to 17 (COCO) so an empty detection result keeps working;
    # otherwise the keypoint count comes from the skeletons themselves
    kpt_nums = 17
    if len(skeletons) > 0:
        kpt_nums = skeletons.shape[1]
    if kpt_nums == 17:  # plot coco keypoints
        EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7),
                 (6, 8), (7, 9), (8, 10), (5, 11), (6, 12), (11, 13),
                 (12, 14), (13, 15), (14, 16), (11, 12)]
    elif kpt_nums == 133:  # plot coco wholebody keypoints
        EDGES = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 12), (5, 11),
                 (6, 12), (5, 6), (5, 7), (6, 8), (7, 9), (8, 10), (1, 2),
                 (0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (15, 17),
                 (15, 18), (15, 19), (16, 20), (16, 21), (16, 22), (91, 92),
                 (92, 93), (93, 94), (94, 95), (91, 96), (96, 97), (97, 98),
                 (98, 99), (91, 100), (100, 101), (101, 102), (102, 103),
                 (91, 104), (104, 105), (105, 106), (106, 107), (91, 108),
                 (108, 109), (109, 110), (110, 111), (112, 113), (113, 114),
                 (114, 115), (115, 116), (112, 117), (117, 118), (118, 119),
                 (119, 120), (112, 121), (121, 122), (122, 123), (123, 124),
                 (112, 125), (125, 126), (126, 127), (127, 128), (112, 129),
                 (129, 130), (130, 131), (131, 132)]
    else:  # plot mpii keypoints
        EDGES = [(0, 1), (1, 2), (3, 4), (4, 5), (2, 6), (3, 6), (6, 7),
                 (7, 8), (8, 9), (10, 11), (11, 12), (13, 14), (14, 15),
                 (8, 12), (8, 13)]
NUM_EDGES = len(EDGES)

-   colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
-             [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
-             [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
if kpt_nums == 133:
colors = [(51, 153, 255), (51, 153, 255), (51, 153, 255), (51, 153, 255), (51, 153, 255), (0, 255, 0), (255, 128, 0), (0, 255, 0), (255, 128, 0), (0, 255, 0), (255, 128, 0), (0, 255, 0), (255, 128, 0), (0, 255, 0), (255, 128, 0), (0, 255, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 255, 255), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 153, 255), (255, 153, 255), (255, 153, 255), (255, 153, 255), (102, 178, 255), (102, 178, 255), (102, 178, 255), (102, 178, 255), (255, 51, 51), (255, 51, 51), (255, 51, 51), (255, 51, 51), (0, 255, 0), (0, 255, 0), (0, 255, 0), (0, 255, 0), (255, 255, 255), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 153, 255), (255, 153, 255), (255, 153, 255), (255, 153, 255), (102, 178, 255), (102, 178, 255), (102, 178, 255), (102, 178, 255), (255, 51, 51), (255, 51, 51), (255, 51, 51), (255, 51, 51), (0, 255, 0), (0, 255, 0), (0, 255, 0), (0, 255, 0)]
else:
colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
[0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
[170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
skeleton_link_colors = [(0, 255, 0), (0, 255, 0), (255, 128, 0), (255, 128, 0), (51, 153, 255), (51, 153, 255), (51, 153, 255), (51, 153, 255), (0, 255, 0), (255, 128, 0), (0, 255, 0), (255, 128, 0), (51, 153, 255), (51, 153, 255), (51, 153, 255), (51, 153, 255), (51, 153, 255), (51, 153, 255), (51, 153, 255), (0, 255, 0), (0, 255, 0), (0, 255, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 153, 255), (255, 153, 255), (255, 153, 255), (255, 153, 255), (102, 178, 255), (102, 178, 255), (102, 178, 255), (102, 178, 255), (255, 51, 51), (255, 51, 51), (255, 51, 51), (255, 51, 51), (0, 255, 0), (0, 255, 0), (0, 255, 0), (0, 255, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 128, 0), (255, 153, 255), (255, 153, 255), (255, 153, 255), (255, 153, 255), (102, 178, 255), (102, 178, 255), (102, 178, 255), (102, 178, 255), (255, 51, 51), (255, 51, 51), (255, 51, 51), (255, 51, 51), (0, 255, 0), (0, 255, 0), (0, 255, 0), (0, 255, 0)]
cmap = matplotlib.cm.get_cmap('hsv')
plt.figure()

img = cv2.imread(imgfile) if type(imgfile) == str else imgfile

color_set = results['colors'] if 'colors' in results else None

-   if 'bbox' in results and ids is None:
+   if 'bbox' in results and ids is None and draw_box:
bboxs = results['bbox']
for j, rect in enumerate(bboxs):
xmin, ymin, xmax, ymax = rect
@@ -325,7 +332,7 @@
(int(length / 2), stickwidth),
int(angle), 0, 360, 1)
            if ids is None:
-               color = colors[i] if color_set is None else colors[
-                   color_set[j] % len(colors)]
+               color = skeleton_link_colors[i] if color_set is None else colors[
+                   color_set[j] % len(colors)]
else:
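To exercise the new 133-keypoint branch without running a model, a shape-only sketch (random keypoints on a dummy canvas; the nested list layout is assumed to mirror what the top-down pipeline emits):

    import numpy as np
    from visualize import visualize_pose

    # (num_person, num_kpts, [x, y, conf])
    skeletons = np.random.rand(1, 133, 3).astype(np.float32)
    skeletons[..., :2] *= 256  # spread the points over the canvas
    results = {'keypoint': [skeletons.tolist(), [[0.9]]]}

    canvas = np.zeros((256, 256, 3), dtype=np.uint8)
    vis = visualize_pose(canvas, results, visual_thresh=0.3, returnimg=True)

Because kpt_nums is read from the skeletons themselves, the same call renders COCO (17), MPII (16) and wholebody (133) results, and with draw_box left at False no detector boxes are drawn.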
5 changes: 3 additions & 2 deletions ppdet/engine/export_utils.py
@@ -55,10 +55,11 @@
'YOLOF': 40,
'METRO_Body': 3,
'DETR': 3,
-   'CLRNet': 3
+   'CLRNet': 3,
+   'VitPose_TopDown_WholeBody': 3
}

-KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']
+KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet', 'VitPose_TopDown_WholeBody']
MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
LANE_ARCH = ['CLRNet']

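For context (an assumption from the surrounding entries, since the dict's name sits outside the hunk): the integer map appears to be the per-architecture TensorRT min-subgraph-size table, and KEYPOINT_ARCH is what the engine consults to treat an exported model as a keypoint model. A hypothetical sketch of a consumer:

    TRT_MIN_SUBGRAPH = {'DETR': 3, 'CLRNet': 3, 'VitPose_TopDown_WholeBody': 3}  # dict name assumed
    KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet', 'VitPose_TopDown_WholeBody']

    def export_hints(arch):
        # illustrative only; the real lookup lives in ppdet/engine
        return {
            'min_subgraph_size': TRT_MIN_SUBGRAPH.get(arch, 3),
            'is_keypoint': arch in KEYPOINT_ARCH,
        }

    print(export_hints('VitPose_TopDown_WholeBody'))
    # {'min_subgraph_size': 3, 'is_keypoint': True}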
2 changes: 2 additions & 0 deletions ppdet/modeling/architectures/__init__.py
@@ -26,6 +26,7 @@
from . import keypoint_hrhrnet
from . import keypoint_hrnet
from . import keypoint_vitpose
from . import keypoint_vitpose_wholebody
from . import jde
from . import deepsort
from . import fairmot
@@ -61,6 +62,7 @@
from .keypoint_hrhrnet import *
from .keypoint_hrnet import *
from .keypoint_vitpose import *
from .keypoint_vitpose_wholebody import *
from .jde import *
from .deepsort import *
from .fairmot import *