From 837a8e946888c29c1a9cc05bb7824751c84d19bb Mon Sep 17 00:00:00 2001
From: Benn Snyder <benn.snyder@gmail.com>
Date: Mon, 3 Jan 2022 15:20:17 -0500
Subject: [PATCH 1/2] refactor: extract convert functions

---
 src/cameras.c | 287 +--------------------------------------------
 src/convert.h | 314 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 318 insertions(+), 283 deletions(-)
 create mode 100644 src/convert.h

diff --git a/src/cameras.c b/src/cameras.c
index a5d580a1..6ac5bc04 100644
--- a/src/cameras.c
+++ b/src/cameras.c
@@ -31,6 +31,7 @@
 
 #include "freenect_internal.h"
 #include "registration.h"
+#include "convert.h"
 #include "cameras.h"
 #include "flags.h"
 
@@ -292,88 +293,6 @@ static int stream_setbuf(freenect_context *ctx, packet_stream *strm, void *pbuf)
 	}
 }
 
-/**
- * Convert a packed array of n elements with vw useful bits into array of
- * zero-padded 16bit elements.
- *
- * @param src The source packed array, of size (n * vw / 8) bytes
- * @param dest The destination unpacked array, of size (n * 2) bytes
- * @param vw The virtual width of elements, that is the number of useful bits for each of them
- * @param n The number of elements (in particular, of the destination array), NOT a length in bytes
- */
-static inline void convert_packed_to_16bit(uint8_t *src, uint16_t *dest, int vw, int n)
-{
-	unsigned int mask = (1 << vw) - 1;
-	uint32_t buffer = 0;
-	int bitsIn = 0;
-	while (n--) {
-		while (bitsIn < vw) {
-			buffer = (buffer << 8) | *(src++);
-			bitsIn += 8;
-		}
-		bitsIn -= vw;
-		*(dest++) = (buffer >> bitsIn) & mask;
-	}
-}
-
-/**
- * Convert a packed array of n elements with vw useful bits into array of
- * 8bit elements, dropping LSB.
- *
- * @param src The source packed array, of size (n * vw / 8) bytes
- * @param dest The destination unpacked array, of size (n * 2) bytes
- * @param vw The virtual width of elements, that is the number of useful bits for each of them
- * @param n The number of elements (in particular, of the destination array), NOT a length in bytes
- *
- * @pre vw is expected to be >= 8.
- */
-static inline void convert_packed_to_8bit(uint8_t *src, uint8_t *dest, int vw, int n)
-{
-	uint32_t buffer = 0;
-	int bitsIn = 0;
-	while (n--) {
-		while (bitsIn < vw) {
-			buffer = (buffer << 8) | *(src++);
-			bitsIn += 8;
-		}
-		bitsIn -= vw;
-		*(dest++) = buffer >> (bitsIn + vw - 8);
-	}
-}
-
-// Loop-unrolled version of the 11-to-16 bit unpacker.  n must be a multiple of 8.
-static void convert_packed11_to_16bit(uint8_t *raw, uint16_t *frame, int n)
-{
-	uint16_t baseMask = (1 << 11) - 1;
-	while(n >= 8)
-	{
-		uint8_t r0  = *(raw+0);
-		uint8_t r1  = *(raw+1);
-		uint8_t r2  = *(raw+2);
-		uint8_t r3  = *(raw+3);
-		uint8_t r4  = *(raw+4);
-		uint8_t r5  = *(raw+5);
-		uint8_t r6  = *(raw+6);
-		uint8_t r7  = *(raw+7);
-		uint8_t r8  = *(raw+8);
-		uint8_t r9  = *(raw+9);
-		uint8_t r10 = *(raw+10);
-
-		frame[0] =  (r0<<3)  | (r1>>5);
-		frame[1] = ((r1<<6)  | (r2>>2) )           & baseMask;
-		frame[2] = ((r2<<9)  | (r3<<1) | (r4>>7) ) & baseMask;
-		frame[3] = ((r4<<4)  | (r5>>4) )           & baseMask;
-		frame[4] = ((r5<<7)  | (r6>>1) )           & baseMask;
-		frame[5] = ((r6<<10) | (r7<<2) | (r8>>6) ) & baseMask;
-		frame[6] = ((r8<<5)  | (r9>>3) )           & baseMask;
-		frame[7] = ((r9<<8)  | (r10)   )           & baseMask;
-
-		n -= 8;
-		raw += 11;
-		frame += 8;
-	}
-}
-
 static void depth_process(freenect_device *dev, uint8_t *pkt, int len)
 {
 	freenect_context *ctx = dev->parent;
@@ -394,7 +313,7 @@ static void depth_process(freenect_device *dev, uint8_t *pkt, int len)
 
 	switch (dev->depth_format) {
 		case FREENECT_DEPTH_11BIT:
-			convert_packed11_to_16bit(dev->depth.raw_buf, (uint16_t*)dev->depth.proc_buf, 640*480);
+			convert_packed_to_16bit(dev->depth.raw_buf, (uint16_t*)dev->depth.proc_buf, 11, 640*480);
 			break;
 		case FREENECT_DEPTH_REGISTERED:
 			freenect_apply_registration(dev, dev->depth.raw_buf, (uint16_t*)dev->depth.proc_buf, false);
@@ -416,204 +335,6 @@ static void depth_process(freenect_device *dev, uint8_t *pkt, int len)
 		dev->depth_cb(dev, dev->depth.proc_buf, dev->depth.timestamp);
 }
 
-#define CLAMP(x) if (x < 0) {x = 0;} if (x > 255) {x = 255;}
-static void convert_uyvy_to_rgb(uint8_t *raw_buf, uint8_t *proc_buf, freenect_frame_mode frame_mode)
-{
-	int x, y;
-	for(y = 0; y < frame_mode.height; ++y) {
-		for(x = 0; x < frame_mode.width; x+=2) {
-			int i = (frame_mode.width * y + x);
-			int u  = raw_buf[2*i];
-			int y1 = raw_buf[2*i+1];
-			int v  = raw_buf[2*i+2];
-			int y2 = raw_buf[2*i+3];
-			int r1 = (y1-16)*1164/1000 + (v-128)*1596/1000;
-			int g1 = (y1-16)*1164/1000 - (v-128)*813/1000 - (u-128)*391/1000;
-			int b1 = (y1-16)*1164/1000 + (u-128)*2018/1000;
-			int r2 = (y2-16)*1164/1000 + (v-128)*1596/1000;
-			int g2 = (y2-16)*1164/1000 - (v-128)*813/1000 - (u-128)*391/1000;
-			int b2 = (y2-16)*1164/1000 + (u-128)*2018/1000;
-			CLAMP(r1)
-			CLAMP(g1)
-			CLAMP(b1)
-			CLAMP(r2)
-			CLAMP(g2)
-			CLAMP(b2)
-			proc_buf[3*i]  =r1;
-			proc_buf[3*i+1]=g1;
-			proc_buf[3*i+2]=b1;
-			proc_buf[3*i+3]=r2;
-			proc_buf[3*i+4]=g2;
-			proc_buf[3*i+5]=b2;
-		}
-	}
-}
-#undef CLAMP
-
-static void convert_bayer_to_rgb(uint8_t *raw_buf, uint8_t *proc_buf, freenect_frame_mode frame_mode)
-{
-	int x,y;
-	/* Pixel arrangement:
-	 * G R G R G R G R
-	 * B G B G B G B G
-	 * G R G R G R G R
-	 * B G B G B G B G
-	 * G R G R G R G R
-	 * B G B G B G B G
-	 *
-	 * To convert a Bayer-pattern into RGB you have to handle four pattern
-	 * configurations:
-	 * 1)         2)         3)         4)
-	 *      B1      B1 G1 B2   R1 G1 R2      R1       <- previous line
-	 *   R1 G1 R2   G2 R1 G3   G2 B1 G3   B1 G1 B2    <- current line
-	 *      B2      B3 G4 B4   R3 G4 R4      R2       <- next line
-	 *   ^  ^  ^
-	 *   |  |  next pixel
-	 *   |  current pixel
-	 *   previous pixel
-	 *
-	 * The RGB values (r,g,b) for each configuration are calculated as
-	 * follows:
-	 *
-	 * 1) r = (R1 + R2) / 2
-	 *    g =  G1
-	 *    b = (B1 + B2) / 2
-	 *
-	 * 2) r =  R1
-	 *    g = (G1 + G2 + G3 + G4) / 4
-	 *    b = (B1 + B2 + B3 + B4) / 4
-	 *
-	 * 3) r = (R1 + R2 + R3 + R4) / 4
-	 *    g = (G1 + G2 + G3 + G4) / 4
-	 *    b =  B1
-	 *
-	 * 4) r = (R1 + R2) / 2
-	 *    g =  G1
-	 *    b = (B1 + B2) / 2
-	 *
-	 * To efficiently calculate these values, two 32bit integers are used
-	 * as "shift-buffers". One integer to store the 3 horizontal bayer pixel
-	 * values (previous, current, next) of the current line. The other
-	 * integer to store the vertical average value of the bayer pixels
-	 * (previous, current, next) of the previous and next line.
-	 *
-	 * The boundary conditions for the first and last line and the first
-	 * and last column are solved via mirroring the second and second last
-	 * line and the second and second last column.
-	 *
-	 * To reduce slow memory access, the values of a rgb pixel are packet
-	 * into a 32bit variable and transfered together.
-	 */
-
-	uint8_t *dst = proc_buf; // pointer to destination
-
-	uint8_t *prevLine;        // pointer to previous, current and next line
-	uint8_t *curLine;         // of the source bayer pattern
-	uint8_t *nextLine;
-
-	// storing horizontal values in hVals:
-	// previous << 16, current << 8, next
-	uint32_t hVals;
-	// storing vertical averages in vSums:
-	// previous << 16, current << 8, next
-	uint32_t vSums;
-
-	// init curLine and nextLine pointers
-	curLine  = raw_buf;
-	nextLine = curLine + frame_mode.width;
-	for (y = 0; y < frame_mode.height; ++y) {
-
-		if ((y > 0) && (y < frame_mode.height-1))
-			prevLine = curLine - frame_mode.width; // normal case
-		else if (y == 0)
-			prevLine = nextLine;      // top boundary case
-		else
-			nextLine = prevLine;      // bottom boundary case
-
-		// init horizontal shift-buffer with current value
-		hVals  = (*(curLine++) << 8);
-		// handle left column boundary case
-		hVals |= (*curLine << 16);
-		// init vertical average shift-buffer with current values average
-		vSums = ((*(prevLine++) + *(nextLine++)) << 7) & 0xFF00;
-		// handle left column boundary case
-		vSums |= ((*prevLine + *nextLine) << 15) & 0xFF0000;
-
-		// store if line is odd or not
-		uint8_t yOdd = y & 1;
-		// the right column boundary case is not handled inside this loop
-		// thus the "639"
-		for (x = 0; x < frame_mode.width-1; ++x) {
-			// place next value in shift buffers
-			hVals |= *(curLine++);
-			vSums |= (*(prevLine++) + *(nextLine++)) >> 1;
-
-			// calculate the horizontal sum as this sum is needed in
-			// any configuration
-			uint8_t hSum = ((uint8_t)(hVals >> 16) + (uint8_t)(hVals)) >> 1;
-
-			if (yOdd == 0) {
-				if ((x & 1) == 0) {
-					// Configuration 1
-					*(dst++) = hSum;		// r
-					*(dst++) = hVals >> 8;	// g
-					*(dst++) = vSums >> 8;	// b
-				} else {
-					// Configuration 2
-					*(dst++) = hVals >> 8;
-					*(dst++) = (hSum + (uint8_t)(vSums >> 8)) >> 1;
-					*(dst++) = ((uint8_t)(vSums >> 16) + (uint8_t)(vSums)) >> 1;
-				}
-			} else {
-				if ((x & 1) == 0) {
-					// Configuration 3
-					*(dst++) = ((uint8_t)(vSums >> 16) + (uint8_t)(vSums)) >> 1;
-					*(dst++) = (hSum + (uint8_t)(vSums >> 8)) >> 1;
-					*(dst++) = hVals >> 8;
-				} else {
-					// Configuration 4
-					*(dst++) = vSums >> 8;
-					*(dst++) = hVals >> 8;
-					*(dst++) = hSum;
-				}
-			}
-
-			// shift the shift-buffers
-			hVals <<= 8;
-			vSums <<= 8;
-		} // end of for x loop
-		// right column boundary case, mirroring second last column
-		hVals |= (uint8_t)(hVals >> 16);
-		vSums |= (uint8_t)(vSums >> 16);
-
-		// the horizontal sum simplifies to the second last column value
-		uint8_t hSum = (uint8_t)(hVals);
-
-		if (yOdd == 0) {
-			if ((x & 1) == 0) {
-				*(dst++) = hSum;
-				*(dst++) = hVals >> 8;
-				*(dst++) = vSums >> 8;
-			} else {
-				*(dst++) = hVals >> 8;
-				*(dst++) = (hSum + (uint8_t)(vSums >> 8)) >> 1;
-				*(dst++) = vSums;
-			}
-		} else {
-			if ((x & 1) == 0) {
-				*(dst++) = vSums;
-				*(dst++) = (hSum + (uint8_t)(vSums >> 8)) >> 1;
-				*(dst++) = hVals >> 8;
-			} else {
-				*(dst++) = vSums >> 8;
-				*(dst++) = hVals >> 8;
-				*(dst++) = hSum;
-			}
-		}
-
-	} // end of for y loop
-}
-
 static void video_process(freenect_device *dev, uint8_t *pkt, int len)
 {
 	freenect_context *ctx = dev->parent;
@@ -635,7 +356,7 @@ static void video_process(freenect_device *dev, uint8_t *pkt, int len)
 	freenect_frame_mode frame_mode = freenect_get_current_video_mode(dev);
 	switch (dev->video_format) {
 		case FREENECT_VIDEO_RGB:
-			convert_bayer_to_rgb(dev->video.raw_buf, (uint8_t*)dev->video.proc_buf, frame_mode);
+			convert_bayer_to_rgb(dev->video.raw_buf, (uint8_t*)dev->video.proc_buf, frame_mode.width, frame_mode.height);
 			break;
 		case FREENECT_VIDEO_BAYER:
 			break;
@@ -648,7 +369,7 @@ static void video_process(freenect_device *dev, uint8_t *pkt, int len)
 			convert_packed_to_8bit(dev->video.raw_buf, (uint8_t*)dev->video.proc_buf, 10, frame_mode.width * frame_mode.height);
 			break;
 		case FREENECT_VIDEO_YUV_RGB:
-			convert_uyvy_to_rgb(dev->video.raw_buf, (uint8_t*)dev->video.proc_buf, frame_mode);
+			convert_uyvy_to_rgb(dev->video.raw_buf, (uint8_t*)dev->video.proc_buf, frame_mode.width, frame_mode.height);
 			break;
 		case FREENECT_VIDEO_YUV_RAW:
 			break;
diff --git a/src/convert.h b/src/convert.h
new file mode 100644
index 00000000..1c7f9f49
--- /dev/null
+++ b/src/convert.h
@@ -0,0 +1,314 @@
+/*
+ * This file is part of the OpenKinect Project. http://www.openkinect.org
+ *
+ * Copyright (c) 2021 individual OpenKinect contributors. See the CONTRIB file
+ * for details.
+ *
+ * This code is licensed to you under the terms of the Apache License, version
+ * 2.0, or, at your option, the terms of the GNU General Public License,
+ * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses,
+ * or the following URLs:
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * If you redistribute this file in source form, modified or unmodified, you
+ * may:
+ *   1) Leave this header intact and distribute it under the same terms,
+ *      accompanying it with the APACHE20 and GPL20 files, or
+ *   2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or
+ *   3) Delete the GPL v2 clause and accompany it with the APACHE20 file
+ * In all cases you must keep the copyright notice intact and include a copy
+ * of the CONTRIB file.
+ *
+ * Binary distributions must follow the binary distribution requirements of
+ * either License.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+// Loop-unrolled 11-to-16 bit unpacker: every 11 packed bytes yield 8 samples.
+// n must be a multiple of 8; a trailing group of fewer than 8 is not converted.
+static inline void convert_packed11_to_16bit(uint8_t *raw, uint16_t *frame, int n)
+{
+	const uint16_t baseMask = (1 << 11) - 1;
+	while (n >= 8) {
+		uint8_t r0  = raw[0];
+		uint8_t r1  = raw[1];
+		uint8_t r2  = raw[2];
+		uint8_t r3  = raw[3];
+		uint8_t r4  = raw[4];
+		uint8_t r5  = raw[5];
+		uint8_t r6  = raw[6];
+		uint8_t r7  = raw[7];
+		uint8_t r8  = raw[8];
+		uint8_t r9  = raw[9];
+		uint8_t r10 = raw[10];
+
+		frame[0] =  (r0<<3)  | (r1>>5);
+		frame[1] = ((r1<<6)  | (r2>>2) )           & baseMask;
+		frame[2] = ((r2<<9)  | (r3<<1) | (r4>>7) ) & baseMask;
+		frame[3] = ((r4<<4)  | (r5>>4) )           & baseMask;
+		frame[4] = ((r5<<7)  | (r6>>1) )           & baseMask;
+		frame[5] = ((r6<<10) | (r7<<2) | (r8>>6) ) & baseMask;
+		frame[6] = ((r8<<5)  | (r9>>3) )           & baseMask;
+		frame[7] = ((r9<<8)  | (r10)   )           & baseMask;
+
+		n -= 8;
+		raw += 11;
+		frame += 8;
+	}
+}
+
+/**
+ * Convert a packed array of n elements with vw useful bits into array of
+ * zero-padded 16bit elements.
+ *
+ * @param src The source packed array, of size (n * vw / 8) bytes
+ * @param dest The destination unpacked array, of size (n * 2) bytes
+ * @param vw The virtual width of elements, that is the number of useful bits for each of them
+ * @param n The number of elements (in particular, of the destination array), NOT a length in bytes
+ */
+static inline void convert_packed_to_16bit(uint8_t *src, uint16_t *dest, int vw, int n)
+{
+	if (vw == 11) { // unrolled fast path for whole groups of 8 (8 * 11 bits = 11 bytes)
+		int bulk = n - (n % 8);
+		convert_packed11_to_16bit(src, dest, bulk);
+		src += (bulk / 8) * 11; dest += bulk; n -= bulk; // any remainder falls through to the generic loop
+	}
+	unsigned int mask = (1 << vw) - 1;
+	uint32_t buffer = 0;
+	int bitsIn = 0;
+	while (n--) {
+		while (bitsIn < vw) {
+			buffer = (buffer << 8) | *(src++);
+			bitsIn += 8;
+		}
+		bitsIn -= vw;
+		*(dest++) = (buffer >> bitsIn) & mask;
+	}
+}
+
+/**
+ * Convert a packed array of n elements with vw useful bits into array of
+ * 8bit elements, dropping the least significant (vw - 8) bits of each.
+ *
+ * @param src The source packed array, of size (n * vw / 8) bytes
+ * @param dest The destination unpacked array, of size n bytes
+ * @param vw The virtual width of elements, that is the number of useful bits for each of them
+ * @param n The number of elements (in particular, of the destination array), NOT a length in bytes
+ *
+ * @pre vw is expected to be >= 8.
+ */
+static inline void convert_packed_to_8bit(uint8_t *src, uint8_t *dest, int vw, int n)
+{
+	uint32_t buffer = 0;
+	int bitsIn = 0;
+	while (n--) {
+		while (bitsIn < vw) { // refill one byte at a time until a whole element is buffered
+			buffer = (buffer << 8) | *(src++);
+			bitsIn += 8;
+		}
+		bitsIn -= vw;
+		*(dest++) = buffer >> (bitsIn + vw - 8); // keep the top 8 bits of the element
+	}
+}
+
+// Convert a UYVY frame (bytes U, Y1, V, Y2 per pixel pair) to 8-bit RGB triplets; width is expected to be even.
+#define CLAMP(x) do { if ((x) < 0) {(x) = 0;} if ((x) > 255) {(x) = 255;} } while (0)
+static inline void convert_uyvy_to_rgb(uint8_t *raw_buf, uint8_t *proc_buf, int16_t width, int16_t height)
+{
+	for (int y = 0; y < height; ++y) {
+		for (int x = 0; x < width; x += 2) {
+			int i = (width * y + x);
+			int u  = raw_buf[2*i];
+			int y1 = raw_buf[2*i+1];
+			int v  = raw_buf[2*i+2];
+			int y2 = raw_buf[2*i+3];
+			int r1 = (y1-16)*1164/1000 + (v-128)*1596/1000;
+			int g1 = (y1-16)*1164/1000 - (v-128)*813/1000 - (u-128)*391/1000;
+			int b1 = (y1-16)*1164/1000 + (u-128)*2018/1000;
+			int r2 = (y2-16)*1164/1000 + (v-128)*1596/1000;
+			int g2 = (y2-16)*1164/1000 - (v-128)*813/1000 - (u-128)*391/1000;
+			int b2 = (y2-16)*1164/1000 + (u-128)*2018/1000;
+			CLAMP(r1);
+			CLAMP(g1);
+			CLAMP(b1);
+			CLAMP(r2);
+			CLAMP(g2);
+			CLAMP(b2);
+			proc_buf[3*i]  =r1;
+			proc_buf[3*i+1]=g1;
+			proc_buf[3*i+2]=b1;
+			proc_buf[3*i+3]=r2;
+			proc_buf[3*i+4]=g2;
+			proc_buf[3*i+5]=b2;
+		}
+	}
+}
+#undef CLAMP
+
+static inline void convert_bayer_to_rgb(uint8_t *raw_buf, uint8_t *proc_buf, int16_t width, int16_t height)
+{
+	int x,y;
+	/* Pixel arrangement:
+	 * G R G R G R G R
+	 * B G B G B G B G
+	 * G R G R G R G R
+	 * B G B G B G B G
+	 * G R G R G R G R
+	 * B G B G B G B G
+	 *
+	 * To convert a Bayer-pattern into RGB you have to handle four pattern
+	 * configurations:
+	 * 1)         2)         3)         4)
+	 *      B1      B1 G1 B2   R1 G1 R2      R1       <- previous line
+	 *   R1 G1 R2   G2 R1 G3   G2 B1 G3   B1 G1 B2    <- current line
+	 *      B2      B3 G4 B4   R3 G4 R4      R2       <- next line
+	 *   ^  ^  ^
+	 *   |  |  next pixel
+	 *   |  current pixel
+	 *   previous pixel
+	 *
+	 * The RGB values (r,g,b) for each configuration are calculated as
+	 * follows:
+	 *
+	 * 1) r = (R1 + R2) / 2
+	 *    g =  G1
+	 *    b = (B1 + B2) / 2
+	 *
+	 * 2) r =  R1
+	 *    g = (G1 + G2 + G3 + G4) / 4
+	 *    b = (B1 + B2 + B3 + B4) / 4
+	 *
+	 * 3) r = (R1 + R2 + R3 + R4) / 4
+	 *    g = (G1 + G2 + G3 + G4) / 4
+	 *    b =  B1
+	 *
+	 * 4) r = (R1 + R2) / 2
+	 *    g =  G1
+	 *    b = (B1 + B2) / 2
+	 *
+	 * To efficiently calculate these values, two 32bit integers are used
+	 * as "shift-buffers". One integer to store the 3 horizontal bayer pixel
+	 * values (previous, current, next) of the current line. The other
+	 * integer to store the vertical average value of the bayer pixels
+	 * (previous, current, next) of the previous and next line.
+	 *
+	 * The boundary conditions for the first and last line and the first
+	 * and last column are solved via mirroring the second and second last
+	 * line and the second and second last column.
+	 *
+	 * Each output pixel is written to proc_buf as three consecutive bytes
+	 * in r, g, b order.
+	 */
+
+	uint8_t *dst = proc_buf; // pointer to destination
+
+	uint8_t *prevLine;        // pointer to previous, current and next line
+	uint8_t *curLine;         // of the source bayer pattern
+	uint8_t *nextLine;
+
+	// storing horizontal values in hVals:
+	// previous << 16, current << 8, next
+	uint32_t hVals;
+	// storing vertical averages in vSums:
+	// previous << 16, current << 8, next
+	uint32_t vSums;
+
+	// init curLine and nextLine pointers
+	curLine  = raw_buf;
+	nextLine = curLine + width;
+	for (y = 0; y < height; ++y) {
+
+		if ((y > 0) && (y < height-1))
+			prevLine = curLine - width; // normal case
+		else if (y == 0)
+			prevLine = nextLine;      // top boundary case
+		else
+			nextLine = prevLine;      // bottom boundary case
+
+		// init horizontal shift-buffer with current value
+		hVals  = (*(curLine++) << 8);
+		// handle left column boundary case
+		hVals |= (*curLine << 16);
+		// init vertical average shift-buffer with current values average
+		vSums = ((*(prevLine++) + *(nextLine++)) << 7) & 0xFF00;
+		// handle left column boundary case
+		vSums |= ((*prevLine + *nextLine) << 15) & 0xFF0000;
+
+		// store if line is odd or not
+		uint8_t yOdd = y & 1;
+		// the right column boundary case is not handled inside this loop
+		// thus the "width-1" bound (639 for a 640 pixel wide frame)
+		for (x = 0; x < width-1; ++x) {
+			// place next value in shift buffers
+			hVals |= *(curLine++);
+			vSums |= (*(prevLine++) + *(nextLine++)) >> 1;
+
+			// calculate the horizontal sum as this sum is needed in
+			// any configuration
+			uint8_t hSum = ((uint8_t)(hVals >> 16) + (uint8_t)(hVals)) >> 1;
+
+			if (yOdd == 0) {
+				if ((x & 1) == 0) {
+					// Configuration 1
+					*(dst++) = hSum;		// r
+					*(dst++) = hVals >> 8;	// g
+					*(dst++) = vSums >> 8;	// b
+				} else {
+					// Configuration 2
+					*(dst++) = hVals >> 8;
+					*(dst++) = (hSum + (uint8_t)(vSums >> 8)) >> 1;
+					*(dst++) = ((uint8_t)(vSums >> 16) + (uint8_t)(vSums)) >> 1;
+				}
+			} else {
+				if ((x & 1) == 0) {
+					// Configuration 3
+					*(dst++) = ((uint8_t)(vSums >> 16) + (uint8_t)(vSums)) >> 1;
+					*(dst++) = (hSum + (uint8_t)(vSums >> 8)) >> 1;
+					*(dst++) = hVals >> 8;
+				} else {
+					// Configuration 4
+					*(dst++) = vSums >> 8;
+					*(dst++) = hVals >> 8;
+					*(dst++) = hSum;
+				}
+			}
+
+			// shift the shift-buffers
+			hVals <<= 8;
+			vSums <<= 8;
+		} // end of for x loop
+		// right column boundary case, mirroring second last column
+		hVals |= (uint8_t)(hVals >> 16);
+		vSums |= (uint8_t)(vSums >> 16);
+
+		// the horizontal sum simplifies to the second last column value
+		uint8_t hSum = (uint8_t)(hVals);
+
+		if (yOdd == 0) {
+			if ((x & 1) == 0) {
+				*(dst++) = hSum;
+				*(dst++) = hVals >> 8;
+				*(dst++) = vSums >> 8;
+			} else {
+				*(dst++) = hVals >> 8;
+				*(dst++) = (hSum + (uint8_t)(vSums >> 8)) >> 1;
+				*(dst++) = vSums;
+			}
+		} else {
+			if ((x & 1) == 0) {
+				*(dst++) = vSums;
+				*(dst++) = (hSum + (uint8_t)(vSums >> 8)) >> 1;
+				*(dst++) = hVals >> 8;
+			} else {
+				*(dst++) = vSums >> 8;
+				*(dst++) = hVals >> 8;
+				*(dst++) = hSum;
+			}
+		}
+
+	} // end of for y loop
+}

From 5c54a0b2c1ff6be2dfc894dbbfedbe7f754df84d Mon Sep 17 00:00:00 2001
From: Benn Snyder <benn.snyder@gmail.com>
Date: Mon, 3 Jan 2022 16:09:44 -0500
Subject: [PATCH 2/2] feat: expose a function to apply depth registration -
 fixes #649

---
 CMakeLists.txt                     |   6 +-
 examples/camtest.c                 |   8 +++
 fakenect/fakenect.c                |  18 +++---
 include/libfreenect_registration.h |  12 ++++
 src/cameras.c                      |   2 +-
 src/registration.c                 | 100 +++++++++++++++--------------
 src/registration.h                 |   4 +-
 7 files changed, 88 insertions(+), 62 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d8525bab..de8ce711 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,8 +46,8 @@ include (FindOS)
 include (SetupDirectories)
 
 set (PROJECT_VER_MAJOR 0)
-set (PROJECT_VER_MINOR 6)
-set (PROJECT_VER_PATCH 3)
+set (PROJECT_VER_MINOR 7)
+set (PROJECT_VER_PATCH 0)
 set (PROJECT_VER
   "${PROJECT_VER_MAJOR}.${PROJECT_VER_MINOR}.${PROJECT_VER_PATCH}")
 set (PROJECT_APIVER
@@ -101,7 +101,7 @@ SET(DOC_OUTPUT_PATH ${CMAKE_BINARY_DIR}/doc)
 if    (MSVC)
   set(C_FLAGS_WARNING "-W4")
 else  ()
-  set(C_FLAGS_WARNING "-Wall")
+  set(C_FLAGS_WARNING "-Wall -Wno-unused-function")
 endif (MSVC)
 
 set(C_CXX_FLAGS_DEFAULT "${C_FLAGS_WARNING} -O2")
diff --git a/examples/camtest.c b/examples/camtest.c
index 7d796cc1..fc3101d2 100644
--- a/examples/camtest.c
+++ b/examples/camtest.c
@@ -28,15 +28,23 @@
 #include <stdio.h>
 #include <string.h>
 #include "libfreenect.h"
+#include "libfreenect_registration.h"
 
 #ifndef SIGQUIT // win32 compat
 	#define SIGQUIT SIGTERM
 #endif
 
 
+uint16_t mapped_depth[640 * 480];
+
 void depth_cb(freenect_device* dev, void* data, uint32_t timestamp)
 {
 	printf("Received depth frame at %d\n", timestamp);
+
+	int err = freenect_map_depth_to_video(dev, data, mapped_depth);
+	if (err) {
+		printf("Registration error %d\n", err);
+	}
 }
 
 void video_cb(freenect_device* dev, void* data, uint32_t timestamp)
diff --git a/fakenect/fakenect.c b/fakenect/fakenect.c
index 8fee220e..eeb7a4b1 100644
--- a/fakenect/fakenect.c
+++ b/fakenect/fakenect.c
@@ -220,18 +220,20 @@ int freenect_process_events(freenect_context *ctx)
 	   best as we can to match those from the original data and current run
 	   conditions (e.g., if it takes longer to run this code then we wait less).
 	 */
+	int err = 0;
+
 	if (!index_fp)
 		open_index();
 	char type;
 	double record_cur_time;
 	unsigned int timestamp, data_size;
 	char *data = NULL;
-	if (parse_line(&type, &record_cur_time, &timestamp, &data_size, &data)) {
-                if (loop_playback) {
+	err = parse_line(&type, &record_cur_time, &timestamp, &data_size, &data);
+	if (err) {
+		if (loop_playback) {
 			close_index();
 			return 0;
-                } else
-		    return -1;
+		} else return err;
 	}
 	// Sleep an amount that compensates for the original and current delays
 	// playback_ is w.r.t. the current time
@@ -250,9 +252,9 @@ int freenect_process_events(freenect_context *ctx)
 				case FREENECT_DEPTH_11BIT:
 				    memcpy(depth_buffer, cur_depth, mode.bytes);
 				    break;
-                                case FREENECT_DEPTH_REGISTERED:
-                                    freenect_apply_registration(fake_dev, cur_depth, depth_buffer, true);
-                                    break;
+				case FREENECT_DEPTH_REGISTERED:
+					err = freenect_apply_registration(&(fake_dev->registration), freenect_find_depth_mode(mode.resolution, FREENECT_DEPTH_11BIT), cur_depth, depth_buffer);
+					break;
 				case FREENECT_DEPTH_MM:
 				    freenect_apply_depth_unpacked_to_mm(fake_dev, cur_depth, depth_buffer);
 				    break;
@@ -303,7 +305,7 @@ int freenect_process_events(freenect_context *ctx)
 	}
 	free(data);
 	playback_prev_time = get_time();
-	return 0;
+	return err;
 }
 
 int freenect_process_events_timeout(freenect_context *ctx, struct timeval *timeout)
diff --git a/include/libfreenect_registration.h b/include/libfreenect_registration.h
index a17a3437..3f877be9 100644
--- a/include/libfreenect_registration.h
+++ b/include/libfreenect_registration.h
@@ -125,6 +125,18 @@ FREENECTAPI void freenect_camera_to_world(freenect_device* dev,
 FREENECTAPI void freenect_map_rgb_to_depth( freenect_device* dev,
 	uint16_t* depth_mm, uint8_t* rgb_raw, uint8_t* rgb_registered );
 
+/**
+ * Converts the depth_frame to output_mm and aligns it with the video frame.
+ * This is similar to using the FREENECT_DEPTH_REGISTERED mode.
+ *
+ * @param dev freenect device
+ * @param depth_frame depth frame buffer, interpreted using the device's current depth mode
+ * @param output_mm output buffer which must have length at least width * height
+ *
+ * @return 0 on success, non-zero on failure (-1 if any argument is NULL)
+ */
+FREENECTAPI int freenect_map_depth_to_video(freenect_device* dev, void* depth_frame, uint16_t* output_mm);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/cameras.c b/src/cameras.c
index 6ac5bc04..1c4e2a06 100644
--- a/src/cameras.c
+++ b/src/cameras.c
@@ -316,7 +316,7 @@ static void depth_process(freenect_device *dev, uint8_t *pkt, int len)
 			convert_packed_to_16bit(dev->depth.raw_buf, (uint16_t*)dev->depth.proc_buf, 11, 640*480);
 			break;
 		case FREENECT_DEPTH_REGISTERED:
-			freenect_apply_registration(dev, dev->depth.raw_buf, (uint16_t*)dev->depth.proc_buf, false);
+			freenect_apply_registration(&(dev->registration), freenect_find_depth_mode(dev->depth_resolution, FREENECT_DEPTH_11BIT_PACKED), dev->depth.raw_buf, (uint16_t*)dev->depth.proc_buf);
 			break;
 		case FREENECT_DEPTH_MM:
 			freenect_apply_depth_to_mm(dev, dev->depth.raw_buf, (uint16_t*)dev->depth.proc_buf );
diff --git a/src/registration.c b/src/registration.c
index cbbf557a..77bf2379 100644
--- a/src/registration.c
+++ b/src/registration.c
@@ -27,6 +27,7 @@
 #include "libfreenect.h"
 #include "freenect_internal.h"
 #include "registration.h"
+#include "convert.h"
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
@@ -73,42 +74,23 @@ static void freenect_init_depth_to_rgb(int32_t* depth_to_rgb, freenect_zero_plan
 	}
 }
 
-// unrolled inner loop of the 11-bit unpacker
-static inline void unpack_8_pixels(uint8_t *raw, uint16_t *frame)
+// apply registration data to a single frame
+FN_INTERNAL int freenect_apply_registration(const freenect_registration* reg, const freenect_frame_mode input_mode, void* input, uint16_t* output_mm)
 {
-	uint16_t baseMask = 0x7FF;
-
-	uint8_t r0  = *(raw+0);
-	uint8_t r1  = *(raw+1);
-	uint8_t r2  = *(raw+2);
-	uint8_t r3  = *(raw+3);
-	uint8_t r4  = *(raw+4);
-	uint8_t r5  = *(raw+5);
-	uint8_t r6  = *(raw+6);
-	uint8_t r7  = *(raw+7);
-	uint8_t r8  = *(raw+8);
-	uint8_t r9  = *(raw+9);
-	uint8_t r10 = *(raw+10);
-
-	frame[0] =  (r0<<3)  | (r1>>5);
-	frame[1] = ((r1<<6)  | (r2>>2) )           & baseMask;
-	frame[2] = ((r2<<9)  | (r3<<1) | (r4>>7) ) & baseMask;
-	frame[3] = ((r4<<4)  | (r5>>4) )           & baseMask;
-	frame[4] = ((r5<<7)  | (r6>>1) )           & baseMask;
-	frame[5] = ((r6<<10) | (r7<<2) | (r8>>6) ) & baseMask;
-	frame[6] = ((r8<<5)  | (r9>>3) )           & baseMask;
-	frame[7] = ((r9<<8)  | (r10)   )           & baseMask;
-}
+	if (!reg || !input || !output_mm) return -1;
+	if (!input_mode.is_valid) return -1;
+	if (input_mode.resolution != FREENECT_RESOLUTION_MEDIUM) return -2;
+
+	if (input_mode.depth_format == FREENECT_DEPTH_REGISTERED) {
+		memcpy(output_mm, input, input_mode.bytes);
+		return 0;
+	}
 
-// apply registration data to a single packed frame
-FN_INTERNAL int freenect_apply_registration(freenect_device* dev, uint8_t* input, uint16_t* output_mm, bool unpacked)
-{
-	freenect_registration* reg = &(dev->registration);
 	// set output buffer to zero using pointer-sized memory access (~ 30-40% faster than memset)
 	size_t i, *wipe = (size_t*)output_mm;
 	for (i = 0; i < DEPTH_X_RES * DEPTH_Y_RES * sizeof(uint16_t) / sizeof(size_t); i++) wipe[i] = DEPTH_NO_MM_VALUE;
 
-	uint16_t unpack[8];
+	uint16_t unpack[8] = { 0 };
 
 	uint32_t target_offset = DEPTH_Y_RES * reg->reg_pad_info.start_lines;
 	uint32_t x,y,source_index = 8;
@@ -116,22 +98,32 @@ FN_INTERNAL int freenect_apply_registration(freenect_device* dev, uint8_t* input
 	for (y = 0; y < DEPTH_Y_RES; y++) {
 		for (x = 0; x < DEPTH_X_RES; x++) {
 
-                        uint16_t metric_depth;
-
-                        if (unpacked) {
-                                uint32_t buf_index = y * DEPTH_X_RES + x;
-                                metric_depth = reg->raw_to_mm_shift[((uint16_t *)input)[buf_index]];
-                        } else {
-                                // get 8 pixels from the packed frame
-                                if (source_index == 8) {
-                                        unpack_8_pixels( input, unpack );
-                                        source_index = 0;
-                                        input += 11;
-                                }
-
-                                // get the value at the current depth pixel, convert to millimeters
-                                metric_depth = reg->raw_to_mm_shift[ unpack[source_index++] ];
-                        }
+			uint16_t metric_depth;
+			const uint32_t pixel_index = (y * input_mode.width) + x; // index used by the unpacked formats
+			switch (input_mode.depth_format) {
+				case FREENECT_DEPTH_MM:
+					// taken as-is, without the raw_to_mm_shift lookup
+					metric_depth = ((uint16_t *)input)[pixel_index];
+					break;
+				case FREENECT_DEPTH_11BIT: // as used by fakenect-record
+				case FREENECT_DEPTH_10BIT: // todo: does this work?
+					// one 16-bit raw sample per pixel, convert to millimeters
+					metric_depth = reg->raw_to_mm_shift[((uint16_t *)input)[pixel_index]];
+					break;
+				case FREENECT_DEPTH_11BIT_PACKED:
+				case FREENECT_DEPTH_10BIT_PACKED:
+					// get 8 pixels from the packed frame
+					if (source_index == 8) {
+						convert_packed_to_16bit(input, unpack, input_mode.data_bits_per_pixel, 8);
+						source_index = 0;
+						input = (uint8_t *)input + input_mode.data_bits_per_pixel; // 8 pixels of N bits span N bytes
+					}
+					// get the value at the current depth pixel, convert to millimeters
+					metric_depth = reg->raw_to_mm_shift[ unpack[source_index++] ];
+					break;
+				default:
+					return -99;
+			}
 
 			// so long as the current pixel has a depth value
 			if (metric_depth == DEPTH_NO_MM_VALUE) continue;
@@ -175,17 +167,29 @@ FN_INTERNAL int freenect_apply_registration(freenect_device* dev, uint8_t* input
 	return 0;
 }
 
+FREENECTAPI int freenect_map_depth_to_video(freenect_device* dev, void* input, uint16_t* output_mm)
+{
+	if (!dev || !input || !output_mm) return -1;
+	if (!dev->registration.registration_table) {
+		int err = freenect_init_registration(dev); // no registration table yet, initialize on first use
+		if (err) return err; // NOTE(review): assumes non-zero means failure - confirm
+	}
+	const freenect_registration* reg = &(dev->registration);
+	const freenect_frame_mode depth_mode = freenect_get_current_depth_mode(dev);
+	return freenect_apply_registration(reg, depth_mode, input, output_mm);
+}
+
 // Same as freenect_apply_registration, but don't bother aligning to the RGB image
 FN_INTERNAL int freenect_apply_depth_to_mm(freenect_device* dev, uint8_t* input_packed, uint16_t* output_mm)
 {
 	freenect_registration* reg = &(dev->registration);
-	uint16_t unpack[8];
+	uint16_t unpack[8] = { 0 };
 	uint32_t x,y,source_index = 8;
 	for (y = 0; y < DEPTH_Y_RES; y++) {
 		for (x = 0; x < DEPTH_X_RES; x++) {
 			// get 8 pixels from the packed frame
 			if (source_index == 8) {
-				unpack_8_pixels( input_packed, unpack );
+				convert_packed_to_16bit(input_packed, unpack, 11, 8);
 				source_index = 0;
 				input_packed += 11;
 			}
diff --git a/src/registration.h b/src/registration.h
index a2a044e2..249f2951 100644
--- a/src/registration.h
+++ b/src/registration.h
@@ -26,11 +26,11 @@
 
 #pragma once
 
-#include <stdbool.h>
+#include <stdint.h>
 #include "libfreenect.h"
 
 // Internal function declarations relating to registration
 int freenect_init_registration(freenect_device* dev);
-int freenect_apply_registration(freenect_device* dev, uint8_t* input, uint16_t* output_mm, bool unpacked);
+int freenect_apply_registration(const freenect_registration* reg, const freenect_frame_mode depth_mode, void* depth_frame, uint16_t* output_mm);
 int freenect_apply_depth_to_mm(freenect_device* dev, uint8_t* input_packed, uint16_t* output_mm);
 int freenect_apply_depth_unpacked_to_mm(freenect_device* dev, uint16_t* input, uint16_t* output_mm);