vladmandic/human

Feature: Padding Input image to square improves face detection

vladmandic opened this issue · 1 comments

Discussed in #468

Originally posted by StillTravelling May 20, 2024
I'm not sure if this is a bug or not, but in order for face detection to work consistently I'm having to pad out an HD image. The incoming video is 25 fps. Using human 3.2.2 on node.

Here's my code:

// Load human from a local absolute path; the `.default` export is the Human class.
const Human = require('C:\\TestNode\\node_modules\\@vladmandic\\human\\dist\\human.node-gpu.js').default;
// Minimal detector-only configuration: every module except the face bounding-box
// detector is disabled to keep per-frame cost low on 25 fps video input.
const humanConfig = {
	modelBasePath: 'file://hmodels', // models are loaded from the local filesystem
	debug: false,
	async: true,
	filter: { enabled: false },
	cacheSensitivity : 0.9, // presumably the frame-change caching threshold — confirm against human config docs
	//skipAllowed: true,
	//skipFrames: 200,

	
	face: {
	  enabled: true,
	  // box detection only: one face max, no rotation correction, 0.8 confidence floor
	  detector: { enabled: true, maxDetected: 1,  rotation: false, minConfidence: 0.8 },
	  attention: { enabled: false },
	  mesh: { enabled: false },
	  iris: { enabled: false },
	  description: { enabled: false },
	  emotion: { enabled: false },
	  antispoof: { enabled: false },
	  liveness: { enabled: false },
	},
	gesture: { enabled: false },
	hand: { enabled: false },
	body: { enabled: false },
	object: { enabled: false },
	segmentation: { enabled: false}
  };

const human = new Human(humanConfig);
...
// Pull a tensor's raw backend data and wrap it in a Node Buffer (for sharp).
async function tensorToBuffer(tensor) {
	const raw = await tensor.data();
	return Buffer.from(raw);
}

// Bilinearly resize an HWC image tensor to width x height.
function resizeImage(imageTensor, width, height) {
	const size = [height, width]; // tf expects [height, width] order
	return human.tf.image.resizeBilinear(imageTensor, size);
}
  
  // Crop a width x height window anchored at (left, top) out of an HWC image tensor.
  function extractImage(imageTensor, left, top, width, height) {
	const begin = [top, left, 0];
	const size = [height, width, -1]; // -1 keeps all channels
	return imageTensor.slice(begin, size);
  }

  // Zero-pad an HWC image tensor so it sits centered inside a
  // targetWidth x targetHeight canvas. Assumes targetWidth >= width and
  // targetHeight >= height — tf.pad throws on negative padding amounts.
  // (Fix: the original also destructured an unused `channels` binding.)
  function padImage(imageTensor, targetWidth, targetHeight) {
	const [height, width] = imageTensor.shape;
	const top = Math.floor((targetHeight - height) / 2);
	const bottom = targetHeight - height - top;
	const left = Math.floor((targetWidth - width) / 2);
	const right = targetWidth - width - left;
	// channel axis gets [0, 0]: no padding across channels
	return human.tf.pad(imageTensor, [[top, bottom], [left, right], [0, 0]]);
  }

let lastbox; // most recent detected face box: [left, top, width, height] (see processImage2)
let facedetect = false; // whether the last processed frame contained a face


/**
 * Decode a JPEG frame and extract a region of interest according to the
 * current aspect-ratio mode, returning it as a sharp image for later resizing.
 *
 * Fixes vs. original:
 *  - tensor_b was LEAKED whenever no face was found (disposed only in the
 *    face-found branch) — likely the cause of the no-face fps drop.
 *  - `busy` was never reset if human.detect() threw, wedging all later frames.
 *  - `thebox` / `sharpImage` were implicit globals; now declared locally.
 *    NOTE(review): assumes nothing outside this function reads them — confirm.
 *
 * @param {Buffer} frameToSend - raw JPEG frame buffer
 * @param {number} arW - source frame width in pixels
 * @param {number} arH - source frame height in pixels
 * @param {number} fW - unused (kept for caller compatibility)
 * @param {number} fH - unused (kept for caller compatibility)
 * @returns {Promise<object|undefined>} sharp image, or undefined when busy
 */
async function processImage2(frameToSend, arW, arH, fW, fH) {
	let extractedImage;

	const tensor = await human.tf.node.decodeJpeg(frameToSend, 3); // decode jpeg buffer to raw HWC tensor

	const current_f = img_dim2;
	const current_ar = use_aspect_ratio; // snapshot: use_aspect_ratio may change mid-processing when received as msg

	if (current_ar == 'cover') {
		extractedImage = tensor;
	} else if (['32', '64', '128', '256', '512'].includes(current_ar)) {
		// select the centered nv x nv square of the image
		const nv = Number.parseInt(current_ar, 10);
		extractedImage = human.tf.tidy(() => extractImage(tensor, (arW - nv) / 2, (arH - nv) / 2, nv, nv));
	} else if (['32t', '64t', '128t', '256t', '512t'].includes(current_ar)) {
		// centered horizontally, vertically offset by one extra square height
		const nv = Number.parseInt(current_ar.replace('t', ''), 10);
		const nvt = nv * 2;
		extractedImage = human.tf.tidy(() => extractImage(tensor, (arW - nv) / 2, (arH - nvt) / 2, nv, nv));
	} else if (current_ar.includes('face')) {
		if (busy) {
			human.tf.dispose(tensor);
			console.log("Busy");
			return;
		}
		busy = true;
		let tensor_b;
		try {
			const nW = arW / current_f; // e.g. 1920 / 4 = 480
			const nH = arH / current_f; // e.g. 1080 / 4 = 270

			const tensor_r = human.tf.tidy(() => resizeImage(tensor, nW, nH)); // downscale to improve performance
			tensor_b = human.tf.tidy(() => padImage(tensor_r, nW, nW)); // pad to an nW x nW square — improves detection
			human.tf.dispose(tensor_r);

			let res;
			if (current_ar == 'faceInterpolated') {
				const res1 = await human.detect(tensor_b);
				res = await human.next(res1); // interpolate for smoother per-frame boxes
			} else {
				res = await human.detect(tensor_b);
			}

			if (res?.face?.[0]) {
				facedetect = true;
				const thebox = res.face[0].box; // [left, top, width, height]
				let left = thebox[0];
				let top = thebox[1];
				let width2 = thebox[2];
				let height2 = thebox[3];
				lastbox = thebox;

				// clamp coordinates/dimensions to the padded square (nW x nW)
				if (left < 0) left = 0;
				if (top < 0) top = 0;
				if (left + thebox[2] > nW) width2 = nW - left;
				if (top + thebox[3] > nW) height2 = nW - top;

				extractedImage = human.tf.tidy(() => extractImage(tensor_b, left, top, width2, height2));
			} else {
				facedetect = false;
				// full-resolution fallback: converting the whole frame below is
				// what makes no-face frames slower than face frames
				extractedImage = tensor;
			}
		} finally {
			// BUGFIX: always dispose tensor_b (original leaked it on the no-face
			// path) and always release the busy flag even if detect() throws.
			if (tensor_b) human.tf.dispose(tensor_b);
			busy = false;
		}
	} else { // contain
		extractedImage = tensor;
	}

	const exBuffer = await tensorToBuffer(extractedImage); // raw buffer so it can be consumed by sharp
	// hand off to sharp for the final resize, as tf bilinear resize quality is poor
	const sharpImage = sharp(exBuffer, { raw: { width: extractedImage.shape[1], height: extractedImage.shape[0], channels: 3 } });
	human.tf.dispose(tensor);
	human.tf.dispose(extractedImage);

	return sharpImage;
}

Is this a bug? I can't find anywhere in the documentation that the input image should be cropped or padded into a square.
Finally, performance is great when a face is found, but when a face isn't found, performance drops from 25 fps to about 17 fps.