
How to improve the performance of the object detection example in the Android sample?

libofei2004 opened this issue · 2 comments

I trained ssd_mobilenet_v2 object detection model by https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/running_on_mobile_tf2.md and https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2_detection_zoo.md
and converted it to TFLite to run in an Android app.
The object detection processing time is currently a little over 30 ms.
I have already enabled NNAPI to increase speed, and I hope to bring the object detection time below 30 ms. What methods can I use to further improve the detection speed?

Several possible approaches:

  1. Check whether all the ops of your converted model are delegated to NNAPI (that is, check whether it is really accelerated by NNAPI).
  2. If your converted TFLite model includes the post-processing NMS part, that part is not supported by NNAPI.
  3. Quantizing your model (https://ai.google.dev/edge/litert/models/model_optimization) might help.
  4. Nowadays few new devices ship with NNAPI drivers (as far as I can tell, only Google Pixel chips and MediaTek SoCs still carry NNAPI drivers for their underlying NPUs).

@freedomtan Thank you. I don't understand point 1: how can I check whether the model is really accelerated by NNAPI? My model has already been quantized.
My main code is:

 * Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *             http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * See the License for the specific language governing permissions and
 * limitations under the License.
package org.tensorflow.lite.examples.objectdetection

import android.content.Context
import android.graphics.Bitmap
import android.os.SystemClock
import android.util.Log
import org.tensorflow.lite.gpu.CompatibilityList
import org.tensorflow.lite.support.image.ImageProcessor
import org.tensorflow.lite.support.image.TensorImage
import org.tensorflow.lite.support.image.ops.Rot90Op
import org.tensorflow.lite.task.core.BaseOptions
import org.tensorflow.lite.task.vision.detector.Detection
import org.tensorflow.lite.task.vision.detector.ObjectDetector

/**
 * Holds the settings used to create an [ObjectDetector] and run detection on bitmaps.
 *
 * NOTE(review): this listing is a partial paste; several statements and closing braces are
 * missing, so the notes below describe only what the visible lines show.
 *
 * @param threshold defaults to 0.2f; presumably the minimum detection score, but the line that
 *   applies it is not visible in this listing (TODO confirm).
 * @param numThreads passed to `BaseOptions.builder().setNumThreads(...)`; defaults to 2.
 * @param maxResults defaults to 1; presumably the maximum number of detections returned, but
 *   the line that applies it is not visible in this listing (TODO confirm).
 * @param currentDelegate selects the branch of the delegate `when` in `setupObjectDetector`;
 *   the default 2 equals `DELEGATE_NNAPI`.
 * @param currentModel selects the model file name in `setupObjectDetector`; the default 0
 *   equals `MODEL_MOBILENETV1`.
 * @param context passed to `ObjectDetector.createFromFileAndOptions`.
 * @param objectDetectorListener optional callback; `onError` is invoked on it when the GPU
 *   delegate is requested but not supported on the device.
 */
class ObjectDetectorHelper(
  var threshold: Float = 0.2f, //0.5f,
  var numThreads: Int = 2,
  var maxResults: Int = 1,
  var currentDelegate: Int = 2,
  var currentModel: Int = 0,
  val context: Context,
  val objectDetectorListener: DetectorListener?
) {

    // For this example this needs to be a var so it can be reset on changes. If the ObjectDetector
    // will not change, a lazy val would be preferable.
    private var objectDetector: ObjectDetector? = null

    init {

    // Drops the reference to the current detector by setting the field to null, so that
    // detect() sees a null detector on its next call.
    // NOTE(review): the detector is only dereferenced here, not explicitly closed — confirm
    // whether native resources need to be released before dropping the reference.
    fun clearObjectDetector() {
        objectDetector = null

    // Initialize the object detector using current settings on the
    // thread that is using it. CPU and NNAPI delegates can be used with detectors
    // that are created on the main thread and used on a background thread, but
    // the GPU delegate needs to be used on the thread that initialized the detector
    fun setupObjectDetector() {
        // NOTE(review): this function is incomplete in the listing — several initializers,
        // branch bodies and closing braces are missing. Comments describe visible lines only.

        // Create the base options for the detector using the specified max results and score threshold
        // NOTE(review): the initializer of optionsBuilder is missing from this listing.
        val optionsBuilder =

        // Set general detection options, including number of used threads
        val baseOptionsBuilder = BaseOptions.builder().setNumThreads(numThreads)

        // Use the specified hardware for running the model. Default to CPU
        when (currentDelegate) {
            DELEGATE_CPU -> {
                // Default
            DELEGATE_GPU -> {
                // The GPU path is gated on the device compatibility check; when the check
                // fails the listener is told and no GPU option is set in the visible code.
                if (CompatibilityList().isDelegateSupportedOnThisDevice) {
                } else {
                    objectDetectorListener?.onError("GPU is not supported on this device")
            // NOTE(review): the body of the NNAPI branch is not visible in this listing, so it
            // cannot be confirmed here that NNAPI is actually enabled on baseOptionsBuilder.
            DELEGATE_NNAPI -> {


        // Map the model selector to an asset file name.
        // NOTE(review): MODEL_MOBILENETV1 maps to "mobilenetv2_2.tflite" while the fallback is
        // "mobilenetv1.tflite" — confirm that the V1 constant is meant to load a v2 model.
        val modelName =
            when (currentModel) {
                MODEL_MOBILENETV1 -> "mobilenetv2_2.tflite"
                MODEL_EFFICIENTDETV0 -> "efficientdet-lite0.tflite"
                MODEL_EFFICIENTDETV1 -> "efficientdet-lite1.tflite"
                MODEL_EFFICIENTDETV2 -> "efficientdet-lite2.tflite"
                MODEL_SHOT -> "saved_model_meta.tflite"
                else -> "mobilenetv1.tflite"

        // Build the detector from the selected model file and the accumulated options.
        // Only IllegalStateException is caught here; any other exception type propagates to
        // the caller.
        try {
            objectDetector =
                ObjectDetector.createFromFileAndOptions(context, modelName, optionsBuilder.build())
        } catch (e: IllegalStateException) {
                "Object detector failed to initialize. See error logs for details"
            // Logged under the tag "Test" with the exception message appended.
            Log.e("Test", "TFLite failed to load model with error: " + e.message)

    /**
     * Runs object detection on [image] and logs the elapsed time.
     *
     * NOTE(review): this function is incomplete in the listing (the null-check body, the
     * image processor builder and the closing braces are missing), so how the results are
     * delivered to the listener is not visible here.
     *
     * @param image bitmap that is converted to a [TensorImage] and passed to the detector.
     * @param imageRotation divided by 90 and negated to form the [Rot90Op] argument;
     *   presumably a rotation in degrees that is a multiple of 90 (TODO confirm).
     */
    fun detect(image: Bitmap, imageRotation: Int) {
        if (objectDetector == null) {

        // Inference time is the difference between the system time at the start and finish of the
        // process
        // The timer starts before the image processor is built and applied, so the logged
        // value covers preprocessing (rotation and TensorImage conversion) as well as the
        // detector call itself.
        var inferenceTime = SystemClock.uptimeMillis()

        // Create preprocessor for the image.
        // See https://www.tensorflow.org/lite/inference_with_metadata/
        //            lite_support#imageprocessor_architecture
        val imageProcessor =
                .add(Rot90Op(-imageRotation / 90))

        // Preprocess the image and convert it into a TensorImage for detection.
        val tensorImage = imageProcessor.process(TensorImage.fromBitmap(image))

        // Safe call: results is null when no detector is currently set.
        val results = objectDetector?.detect(tensorImage)
        inferenceTime = SystemClock.uptimeMillis() - inferenceTime
        // The timing is written at error log level under the tag "inferenceTime".
        Log.e("inferenceTime", "inferenceTime: " + inferenceTime+"ms")

    /**
     * Callback interface through which the helper reports errors and detection results.
     *
     * NOTE(review): the closing tokens of this interface are missing from the listing.
     */
    interface DetectorListener {
        /** Called with a human-readable message, e.g. when the GPU delegate is unsupported. */
        fun onError(error: String)

        /**
         * Receives a detection result.
         *
         * The call site is not visible in this listing, so the descriptions below are hedged:
         * [results] may be null; [inferenceTime] is presumably the elapsed milliseconds
         * computed in `detect`; [imageHeight] and [imageWidth] are presumably the dimensions
         * of the processed image (TODO confirm against the caller).
         */
        fun onResults(
          results: MutableList<Detection>?,
          inferenceTime: Long,
          imageHeight: Int,
          imageWidth: Int

    // Integer selectors used by the helper.
    // NOTE(review): the closing brace of this companion object is missing from the listing.
    companion object {
        // Values matched against currentDelegate when choosing the hardware delegate.
        const val DELEGATE_CPU = 0
        const val DELEGATE_GPU = 1
        const val DELEGATE_NNAPI = 2
        // Values matched against currentModel when choosing the model asset file name.
        const val MODEL_MOBILENETV1 = 0
        const val MODEL_EFFICIENTDETV0 = 1
        const val MODEL_EFFICIENTDETV1 = 2
        const val MODEL_EFFICIENTDETV2 = 3
        const val MODEL_SHOT = 4