Coral Edge TPU USB のサンプルの Object Detection を試す。#3

Coral Edge TPU USB のサンプルの Object Detection が公開されていたので、
試してみました。その #3 です。

https://github.com/google-coral/examples-camera

テスト環境
Raspberry Pi 3B+
Coral Edge TPU USB
Python3.7
Tensorflow 1.15.0
Raspi カメラ

前回のプログラムでは、Raspi で実行させると、発声のタイミングで、Stream バッファに画像が貯まるので、声が出た後の画像が、
実際のカメラの前の映像と一致しない問題が出ました。

OpenCV でのStream バッファのクリア自体は用意されていなくて、代替手段として、cap.read() または、 cap.grab() を5回繰り返して、
受け捨てする方法が Web に出ていたので、試してみました。

しかし、結論としては余り良くなかったので、picamera stream to OpenCV が出来ないか Web で検索していたら、
丁度良いサンプルが在ったので、それを取り入れてみました。

Accessing the Raspberry Pi Camera with OpenCV and Python
https://www.pyimagesearch.com/2015/03/30/accessing-the-raspberry-pi-camera-with-opencv-and-python/

結局は、下記、が元ネタみたいです。
色々な使い方が、出来るようです。

https://picamera.readthedocs.io/en/release-1.13/recipes2.html

前置きが長くなりましたが、プログラムは下記になります。

detect-speaker3.py
# -*- coding:utf-8 -*-
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A demo that runs object detection on camera frames using OpenCV.

TEST_DATA=../all_models

Run coco model:
python3 detect-speaker3.py \
  --model ${TEST_DATA}/mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite \
  --labels ${TEST_DATA}/coco_labels.txt

picamera stream to OpenCV sample
https://www.pyimagesearch.com/2015/03/30/accessing-the-raspberry-pi-camera-with-opencv-and-python/
"""
import argparse
import collections
import common
import cv2
import numpy as np
import os
from PIL import Image
import re
import tflite_runtime.interpreter as tflite

# add by nishi
from timeit import default_timer as timer
import simpleaudio as sa
import wave
import time
from picamera.array import PiRGBArray
from picamera import PiCamera

Object = collections.namedtuple('Object', ['id', 'score', 'bbox'])


def load_labels(path):
    """Read a label file and return a ``{class_id: label_text}`` dict.

    Each useful line starts with an integer id followed by the label text.
    Lines that do not match that shape (for example a blank trailing line)
    are skipped instead of aborting the whole load.
    """
    pattern = re.compile(r'\s*(\d+)(.+)')
    labels = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            matched = pattern.match(line)
            if matched is None:
                continue
            num, text = matched.groups()
            labels[int(num)] = text.strip()
    return labels


class BBox(collections.namedtuple('BBox', ['xmin', 'ymin', 'xmax', 'ymax'])):
    """Bounding box.

    Represents a rectangle which sides are either vertical or horizontal,
    parallel to the x or y axis.
    """
    __slots__ = ()


def get_output(interpreter, score_threshold, top_k, image_scale=1.0):
    """Returns list of detected objects.

    Only the first ``top_k`` entries of the output tensors are examined and
    those scoring below ``score_threshold`` are dropped. Box coordinates are
    clamped to the range [0.0, 1.0]. ``image_scale`` is accepted for
    interface compatibility and is not used.
    """
    boxes = common.output_tensor(interpreter, 0)
    class_ids = common.output_tensor(interpreter, 1)
    scores = common.output_tensor(interpreter, 2)
    # Read as in the original code; the value is not used below.
    count = int(common.output_tensor(interpreter, 3))

    def make(i):
        ymin, xmin, ymax, xmax = boxes[i]
        return Object(
            id=int(class_ids[i]),
            score=scores[i],
            bbox=BBox(xmin=np.maximum(0.0, xmin),
                      ymin=np.maximum(0.0, ymin),
                      xmax=np.minimum(1.0, xmax),
                      ymax=np.minimum(1.0, ymax)))

    return [make(i) for i in range(top_k) if scores[i] >= score_threshold]


def load_wave_tb(wav_dt):
    """Load every WAV file named in ``wav_dt`` into memory.

    Args:
        wav_dt: dict mapping a class id to the path of a WAV file.

    Returns:
        dict mapping the same class ids to a dict with the keys ``'dt'``
        (raw frames), ``'num_channels'``, ``'bytes_per_sample'`` and
        ``'sample_rate'``, i.e. the arguments ``sa.play_buffer`` is later
        called with.
    """
    wav_tb = {}
    for obj_id, path in wav_dt.items():
        # The context manager closes the file once the frames are read.
        with wave.open(path, 'rb') as wave_read:
            wav_tb[obj_id] = {
                'dt': wave_read.readframes(wave_read.getnframes()),
                'num_channels': wave_read.getnchannels(),
                'bytes_per_sample': wave_read.getsampwidth(),
                'sample_rate': wave_read.getframerate(),
            }
    # cat=16 , dog=17 , cup=46
    return wav_tb


def main():
    """Run detection on Pi camera frames, show them and speak known objects."""
    default_model_dir = '../all_models'
    default_model = 'mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite'
    default_labels = 'coco_labels.txt'
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', help='.tflite model path',
                        default=os.path.join(default_model_dir, default_model))
    parser.add_argument('--labels', help='label file path',
                        default=os.path.join(default_model_dir, default_labels))
    parser.add_argument('--top_k', type=int, default=3,
                        help='number of categories with highest score to display')
    parser.add_argument('--camera_idx', type=int,
                        help='Index of which video source to use. ', default = 0)
    parser.add_argument('--threshold', type=float, default=0.5,
                        help='classifier score threshold')
    args = parser.parse_args()

    # cat=16, dog=17, cup=46
    wav_dt = {16: 'neko.wav', 17: 'inu.wav', 46: 'kop.wav'}
    wav_tb = load_wave_tb(wav_dt)

    if False:
        # Manual audio check: flip to True to play one clip and exit.
        fs = wav_tb[46]
        play_obj = sa.play_buffer(fs['dt'], fs['num_channels'],
                                  fs['bytes_per_sample'], fs['sample_rate'])
        play_obj.wait_done()
        exit()

    print('Loading {} with {} labels.'.format(args.model, args.labels))
    interpreter = common.make_interpreter(args.model)
    interpreter.allocate_tensors()

    if False:
        # Manual debug: flip to True to dump the tensor details.
        input_fs = interpreter.get_input_details()
        output_fs = interpreter.get_output_details()
        print('>input_fs=', input_fs)
        print('>output_fs=', output_fs)

    labels = load_labels(args.labels)

    # initialize the camera and grab a reference to the raw camera capture
    camera = PiCamera()
    try:
        camera.resolution = (640, 480)
        camera.framerate = 32
        rawCapture = PiRGBArray(camera, size=(640, 480))

        # allow the camera to warmup
        time.sleep(0.1)

        # test by nishi for fps
        accum_time = 0
        curr_fps = 0
        fps = "FPS: ??"
        prev_time = timer()

        # Frames are captured as RGB so they can be handed to the model
        # without a colour conversion.
        # NOTE(review): cv2.imshow presumably treats 3-channel images as BGR,
        # so red and blue may look swapped on screen - confirm.
        for frame in camera.capture_continuous(rawCapture, format="rgb",
                                               use_video_port=True):
            # grab the raw NumPy array representing the image
            cv2_im = frame.array

            pil_im = Image.fromarray(cv2_im)

            common.set_input(interpreter, pil_im)
            interpreter.invoke()
            objs = get_output(interpreter, score_threshold=args.threshold,
                              top_k=args.top_k)
            cv2_im = append_objs_to_img(cv2_im, objs, labels)

            # add by nishi for fps start
            curr_time = timer()
            exec_time = curr_time - prev_time
            prev_time = curr_time
            accum_time = accum_time + exec_time
            curr_fps = curr_fps + 1
            if accum_time > 1:
                # Roughly once per second, publish the frame count.
                accum_time = accum_time - 1
                fps = "FPS: " + str(curr_fps)
                curr_fps = 0
            cv2.putText(cv2_im, text=fps, org=(3, 15),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.50,
                        color=(255, 0, 0), thickness=2)

            cv2.imshow('frame', cv2_im)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            obj_speaker(objs, wav_tb)

            # clear the stream in preparation for the next frame
            rawCapture.truncate(0)
    finally:
        # The original called cap.release() here, but no `cap` exists in
        # this picamera-based version; release the PiCamera instead.
        camera.close()
        cv2.destroyAllWindows()


def append_objs_to_img(cv2_im, objs, labels):
    """Draw a box and a "<percent>% <label>" caption for each detection.

    Box coordinates in ``objs`` are fractions of the image size and are
    scaled to pixels here. Returns the annotated image.
    """
    height, width, channels = cv2_im.shape
    for obj in objs:
        x0, y0, x1, y1 = list(obj.bbox)
        x0, y0, x1, y1 = (int(x0 * width), int(y0 * height),
                          int(x1 * width), int(y1 * height))
        percent = int(100 * obj.score)
        label = '{}% {}'.format(percent, labels.get(obj.id, obj.id))

        cv2_im = cv2.rectangle(cv2_im, (x0, y0), (x1, y1), (0, 255, 0), 2)
        cv2_im = cv2.putText(cv2_im, label, (x0, y0 + 30),
                             cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0), 2)
    return cv2_im


def obj_speaker(objs, wav_tb):
    """Play the clip registered for each detected object that has one.

    Each clip is waited on before the next one starts. Returns True when at
    least one clip was played, otherwise False.
    """
    spk_f = False
    for obj in objs:
        if obj.id in wav_tb:
            fs = wav_tb[obj.id]
            play_obj = sa.play_buffer(fs['dt'], fs['num_channels'],
                                      fs['bytes_per_sample'],
                                      fs['sample_rate'])
            play_obj.wait_done()
            spk_f = True
    return spk_f


if __name__ == '__main__':
    main()

サンプルのまま、Object Detection を組み込まない状態で、11 FPS 位です。
Object Detection を組み込んで、 6 ～ 7 FPS 位になりました。

この差は、やはり Coral Edge TPU (Raspi 3B+ and USB2.0 and Low performance) での遅延だと思います。
ここが改善されたとしても、11 FPS が限界でしょうか?

後、問題点としては、スピーカーに声が出た時に、その該当の画像が画面に表示されるのが、
遅くて、出ても一瞬なので、見た目、声だけ聞こえて、ボックスの画面が出ていないと思える場合があります。

どうやら、cv2.imshow('frame', cv2_im) の処理の画面の flush が後回しになっているようです。
obj_speaker(objs,wav_tb) の後で、声出しした場合は、delay でもさせた方が良いかもしれません。

結局、下記の様に修正しました。
声だしする時は、声だしの前に cv2.waitKey(2) で 2ms Wait
そうでないときは、cv2.waitKey(1) で 1ms Wait にしました。

detect-speaker3.py 改
# -*- coding:utf-8 -*-
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A demo that runs object detection on camera frames using OpenCV.

TEST_DATA=../all_models

Run coco model:
python3 detect-speaker3.py \
  --model ${TEST_DATA}/mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite \
  --labels ${TEST_DATA}/coco_labels.txt
"""
import argparse
import collections
import common
import cv2
import numpy as np
import os
from PIL import Image
import re
import tflite_runtime.interpreter as tflite

# add by nishi
from timeit import default_timer as timer
import simpleaudio as sa
import wave
import time
from picamera.array import PiRGBArray
from picamera import PiCamera

Object = collections.namedtuple('Object', ['id', 'score', 'bbox'])


def load_labels(path):
    """Read a label file and return a ``{class_id: label_text}`` dict.

    Each useful line starts with an integer id followed by the label text.
    Lines that do not match that shape (for example a blank trailing line)
    are skipped instead of aborting the whole load.
    """
    pattern = re.compile(r'\s*(\d+)(.+)')
    labels = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            matched = pattern.match(line)
            if matched is None:
                continue
            num, text = matched.groups()
            labels[int(num)] = text.strip()
    return labels


class BBox(collections.namedtuple('BBox', ['xmin', 'ymin', 'xmax', 'ymax'])):
    """Bounding box.

    Represents a rectangle which sides are either vertical or horizontal,
    parallel to the x or y axis.
    """
    __slots__ = ()


def get_output(interpreter, score_threshold, top_k, image_scale=1.0):
    """Returns list of detected objects.

    Only the first ``top_k`` entries of the output tensors are examined and
    those scoring below ``score_threshold`` are dropped. Box coordinates are
    clamped to the range [0.0, 1.0]. ``image_scale`` is accepted for
    interface compatibility and is not used.
    """
    boxes = common.output_tensor(interpreter, 0)
    class_ids = common.output_tensor(interpreter, 1)
    scores = common.output_tensor(interpreter, 2)
    # Read as in the original code; the value is not used below.
    count = int(common.output_tensor(interpreter, 3))

    def make(i):
        ymin, xmin, ymax, xmax = boxes[i]
        return Object(
            id=int(class_ids[i]),
            score=scores[i],
            bbox=BBox(xmin=np.maximum(0.0, xmin),
                      ymin=np.maximum(0.0, ymin),
                      xmax=np.minimum(1.0, xmax),
                      ymax=np.minimum(1.0, ymax)))

    return [make(i) for i in range(top_k) if scores[i] >= score_threshold]


def load_wave_tb(wav_dt):
    """Load every WAV file named in ``wav_dt`` into memory.

    Args:
        wav_dt: dict mapping a class id to the path of a WAV file.

    Returns:
        dict mapping the same class ids to a dict with the keys ``'dt'``
        (raw frames), ``'num_channels'``, ``'bytes_per_sample'`` and
        ``'sample_rate'``, i.e. the arguments ``sa.play_buffer`` is later
        called with.
    """
    wav_tb = {}
    for obj_id, path in wav_dt.items():
        # The context manager closes the file once the frames are read.
        with wave.open(path, 'rb') as wave_read:
            wav_tb[obj_id] = {
                'dt': wave_read.readframes(wave_read.getnframes()),
                'num_channels': wave_read.getnchannels(),
                'bytes_per_sample': wave_read.getsampwidth(),
                'sample_rate': wave_read.getframerate(),
            }
    # cat=16 , dog=17 , cup=46
    return wav_tb


def main():
    """Run detection on Pi camera frames, show them and speak known objects.

    When a frame contains detections, the window is given a slightly longer
    ``cv2.waitKey`` before the blocking audio playback starts, so the
    annotated frame is on screen while the clip plays.
    """
    default_model_dir = '../all_models'
    default_model = 'mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite'
    default_labels = 'coco_labels.txt'
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', help='.tflite model path',
                        default=os.path.join(default_model_dir, default_model))
    parser.add_argument('--labels', help='label file path',
                        default=os.path.join(default_model_dir, default_labels))
    parser.add_argument('--top_k', type=int, default=3,
                        help='number of categories with highest score to display')
    parser.add_argument('--camera_idx', type=int,
                        help='Index of which video source to use. ', default = 0)
    parser.add_argument('--threshold', type=float, default=0.5,
                        help='classifier score threshold')
    args = parser.parse_args()

    # cat=16, dog=17, cup=46
    wav_dt = {16: 'neko.wav', 17: 'inu.wav', 46: 'kop.wav'}
    wav_tb = load_wave_tb(wav_dt)

    if False:
        # Manual audio check: flip to True to play one clip and exit.
        fs = wav_tb[46]
        play_obj = sa.play_buffer(fs['dt'], fs['num_channels'],
                                  fs['bytes_per_sample'], fs['sample_rate'])
        play_obj.wait_done()
        exit()

    print('Loading {} with {} labels.'.format(args.model, args.labels))
    interpreter = common.make_interpreter(args.model)
    interpreter.allocate_tensors()

    if False:
        # Manual debug: flip to True to dump the tensor details.
        input_fs = interpreter.get_input_details()
        output_fs = interpreter.get_output_details()
        print('>input_fs=', input_fs)
        print('>output_fs=', output_fs)

    labels = load_labels(args.labels)

    # initialize the camera and grab a reference to the raw camera capture
    camera = PiCamera()
    try:
        camera.resolution = (640, 480)
        camera.framerate = 32
        rawCapture = PiRGBArray(camera, size=(640, 480))

        # allow the camera to warmup
        time.sleep(0.1)

        # test by nishi for fps
        accum_time = 0
        curr_fps = 0
        fps = "FPS: ??"
        prev_time = timer()

        # Frames are captured as RGB so they can be handed to the model
        # without a colour conversion.
        # NOTE(review): cv2.imshow presumably treats 3-channel images as BGR,
        # so red and blue may look swapped on screen - confirm.
        for frame in camera.capture_continuous(rawCapture, format="rgb",
                                               use_video_port=True):
            # grab the raw NumPy array representing the image
            cv2_im = frame.array

            pil_im = Image.fromarray(cv2_im)

            common.set_input(interpreter, pil_im)
            interpreter.invoke()
            objs = get_output(interpreter, score_threshold=args.threshold,
                              top_k=args.top_k)
            cv2_im, obj_id = append_objs_to_img2(cv2_im, objs, labels)

            # add by nishi for fps start
            curr_time = timer()
            exec_time = curr_time - prev_time
            prev_time = curr_time
            accum_time = accum_time + exec_time
            curr_fps = curr_fps + 1
            if accum_time > 1:
                # Roughly once per second, publish the frame count.
                accum_time = accum_time - 1
                fps = "FPS: " + str(curr_fps)
                curr_fps = 0
            cv2.putText(cv2_im, text=fps, org=(3, 15),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.50,
                        color=(255, 0, 0), thickness=2)

            cv2.imshow('frame', cv2_im)
            if obj_id:
                # Wait a little longer so the annotated frame is shown
                # before playback blocks the loop.
                if cv2.waitKey(2) & 0xFF == ord('q'):
                    break
                obj_speaker2(obj_id, wav_tb)
            else:
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

            # clear the stream in preparation for the next frame
            rawCapture.truncate(0)
    finally:
        # Release the camera and close the window on every exit path.
        camera.close()
        cv2.destroyAllWindows()


def append_objs_to_img2(cv2_im, objs, labels):
    """Annotate the image and collect the ids of the drawn detections.

    Draws a box and a "<percent>% <label>" caption for each detection. Box
    coordinates in ``objs`` are fractions of the image size and are scaled
    to pixels here.

    Returns:
        tuple ``(annotated_image, obj_id)`` where ``obj_id`` is the list of
        detected class ids in the order of ``objs``.
    """
    height, width, channels = cv2_im.shape
    obj_id = []
    for obj in objs:
        x0, y0, x1, y1 = list(obj.bbox)
        x0, y0, x1, y1 = (int(x0 * width), int(y0 * height),
                          int(x1 * width), int(y1 * height))
        percent = int(100 * obj.score)
        label = '{}% {}'.format(percent, labels.get(obj.id, obj.id))

        cv2_im = cv2.rectangle(cv2_im, (x0, y0), (x1, y1), (0, 255, 0), 2)
        cv2_im = cv2.putText(cv2_im, label, (x0, y0 + 30),
                             cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0), 2)
        obj_id.append(obj.id)
    return cv2_im, obj_id


def obj_speaker2(obj_id, wav_tb):
    """Play the clip registered for each class id in ``obj_id`` that has one.

    Each clip is waited on before the next one starts.
    """
    for obj in obj_id:
        if obj in wav_tb:
            fs = wav_tb[obj]
            play_obj = sa.play_buffer(fs['dt'], fs['num_channels'],
                                      fs['bytes_per_sample'],
                                      fs['sample_rate'])
            play_obj.wait_done()


if __name__ == '__main__':
    main()

上記、修正でなんとか、感覚と一致するようになりました。
やはり、6～7 FPS は、ちょっと遅いかな。
20 FPS は欲しいかな?

これ以上の高速化には、Raspberry Pi 4 を使うしか無いぞね！！
画像サイズ= 300 x 300 でもよいのかな！！

Coral Edge TPU USB のサンプルの Object Detection を試す。#3

カテゴリ:

検索

このブログ記事について

カテゴリ

月別アーカイブ

ウェブページ

サイトナビ

Coral Edge TPU USB のサンプルの Object Detection を試す。#3

カテゴリ:

検索

このブログ記事について

カテゴリ

月別 アーカイブ

ウェブページ

サイトナビ

月別アーカイブ