[PATCH v2] libcamera: debayer_cpu: Add 32bits/aligned output formats

Tue Jun 18 01:10:39 CEST 2024

Hi Robert,

Thank you for the patch.

On Mon, Jun 17, 2024 at 07:41:19PM +0200, Robert Mader wrote:
> In order to be more compatible with modern hardware and APIs. This
> notably allows GL implementations to directly import the buffers more
> often and seems to be required for Wayland.
> 
> Further more, as we already enforce a 8 byte stride, these formats work
> better for clients that don't support padding - such as libwebrtc at the
> time of writing.
> 
> Tested devices:
>  - Librem5
>  - PinePhone
>  - Thinkpad X13s
> 
> Signed-off-by: Robert Mader <robert.mader at collabora.com>
> Tested-by: Kieran Bingham <kieran.bingham at ideasonboard.com>
> ---
>  src/libcamera/software_isp/debayer_cpu.cpp | 101 ++++++++++++++-------
>  src/libcamera/software_isp/debayer_cpu.h   |   1 +
>  2 files changed, 68 insertions(+), 34 deletions(-)
> 
> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> index c038eed4..79bb4d87 100644
> --- a/src/libcamera/software_isp/debayer_cpu.cpp
> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> @@ -70,10 +70,11 @@ DebayerCpu::~DebayerCpu()
>   * GBG
>   * RGR
>   */
> -#define BGGR_BGR888(p, n, div)                                                                \
> +#define BGGR_BGR888(p, n, div, addAlphaBit)                                                   \
>  	*dst++ = blue_[curr[x] / (div)];                                                      \
>  	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
>  	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> +	if (addAlphaBit) *dst++ = 255;                                                        \

A conditional in here will slow everything down very significantly, not
just for the new 32-bit formats, but for the existing 24-bit formats. I
don't expect Milan and Hans to be very happy about it. We need a way to
get this conditional optimized out at compile time.

Function templates could help avoiding the duplication of the macros
while guaranteeing compile-time optimization.

>  	x++;
>  
>  /*
> @@ -81,10 +82,11 @@ DebayerCpu::~DebayerCpu()
>   * RGR
>   * GBG
>   */
> -#define GRBG_BGR888(p, n, div)                                    \
> +#define GRBG_BGR888(p, n, div, addAlphaBit)                       \
>  	*dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
>  	*dst++ = green_[curr[x] / (div)];                         \
>  	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> +	if (addAlphaBit) *dst++ = 255;                            \
>  	x++;
>  
>  /*
> @@ -92,10 +94,11 @@ DebayerCpu::~DebayerCpu()
>   * BGB
>   * GRG
>   */
> -#define GBRG_BGR888(p, n, div)                                     \
> +#define GBRG_BGR888(p, n, div, addAlphaBit)                        \
>  	*dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
>  	*dst++ = green_[curr[x] / (div)];                          \
>  	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
> +	if (addAlphaBit) *dst++ = 255;                             \
>  	x++;
>  
>  /*
> @@ -103,10 +106,11 @@ DebayerCpu::~DebayerCpu()
>   * GRG
>   * BGB
>   */
> -#define RGGB_BGR888(p, n, div)                                                                 \
> +#define RGGB_BGR888(p, n, div, addAlphaBit)                                                    \
>  	*dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
>  	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
>  	*dst++ = red_[curr[x] / (div)];                                                        \
> +	if (addAlphaBit) *dst++ = 255;                                                         \
>  	x++;
>  
>  void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> @@ -114,8 +118,8 @@ void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  	DECLARE_SRC_POINTERS(uint8_t)
>  
>  	for (int x = 0; x < (int)window_.width;) {
> -		BGGR_BGR888(1, 1, 1)
> -		GBRG_BGR888(1, 1, 1)
> +		BGGR_BGR888(1, 1, 1, addAlphaBit_)
> +		GBRG_BGR888(1, 1, 1, addAlphaBit_)
>  	}
>  }
>  
> @@ -124,8 +128,8 @@ void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  	DECLARE_SRC_POINTERS(uint8_t)
>  
>  	for (int x = 0; x < (int)window_.width;) {
> -		GRBG_BGR888(1, 1, 1)
> -		RGGB_BGR888(1, 1, 1)
> +		GRBG_BGR888(1, 1, 1, addAlphaBit_)
> +		RGGB_BGR888(1, 1, 1, addAlphaBit_)
>  	}
>  }
>  
> @@ -135,8 +139,8 @@ void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  
>  	for (int x = 0; x < (int)window_.width;) {
>  		/* divide values by 4 for 10 -> 8 bpp value */
> -		BGGR_BGR888(1, 1, 4)
> -		GBRG_BGR888(1, 1, 4)
> +		BGGR_BGR888(1, 1, 4, addAlphaBit_)
> +		GBRG_BGR888(1, 1, 4, addAlphaBit_)
>  	}
>  }
>  
> @@ -146,8 +150,8 @@ void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  
>  	for (int x = 0; x < (int)window_.width;) {
>  		/* divide values by 4 for 10 -> 8 bpp value */
> -		GRBG_BGR888(1, 1, 4)
> -		RGGB_BGR888(1, 1, 4)
> +		GRBG_BGR888(1, 1, 4, addAlphaBit_)
> +		RGGB_BGR888(1, 1, 4, addAlphaBit_)
>  	}
>  }
>  
> @@ -157,8 +161,8 @@ void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  
>  	for (int x = 0; x < (int)window_.width;) {
>  		/* divide values by 16 for 12 -> 8 bpp value */
> -		BGGR_BGR888(1, 1, 16)
> -		GBRG_BGR888(1, 1, 16)
> +		BGGR_BGR888(1, 1, 16, addAlphaBit_)
> +		GBRG_BGR888(1, 1, 16, addAlphaBit_)
>  	}
>  }
>  
> @@ -168,8 +172,8 @@ void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  
>  	for (int x = 0; x < (int)window_.width;) {
>  		/* divide values by 16 for 12 -> 8 bpp value */
> -		GRBG_BGR888(1, 1, 16)
> -		RGGB_BGR888(1, 1, 16)
> +		GRBG_BGR888(1, 1, 16, addAlphaBit_)
> +		RGGB_BGR888(1, 1, 16, addAlphaBit_)
>  	}
>  }
>  
> @@ -187,12 +191,12 @@ void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  	 */
>  	for (int x = 0; x < widthInBytes;) {
>  		/* First pixel */
> -		BGGR_BGR888(2, 1, 1)
> +		BGGR_BGR888(2, 1, 1, addAlphaBit_)
>  		/* Second pixel BGGR -> GBRG */
> -		GBRG_BGR888(1, 1, 1)
> +		GBRG_BGR888(1, 1, 1, addAlphaBit_)
>  		/* Same thing for third and fourth pixels */
> -		BGGR_BGR888(1, 1, 1)
> -		GBRG_BGR888(1, 2, 1)
> +		BGGR_BGR888(1, 1, 1, addAlphaBit_)
> +		GBRG_BGR888(1, 2, 1, addAlphaBit_)
>  		/* Skip 5th src byte with 4 x 2 least-significant-bits */
>  		x++;
>  	}
> @@ -207,12 +211,12 @@ void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  
>  	for (int x = 0; x < widthInBytes;) {
>  		/* First pixel */
> -		GRBG_BGR888(2, 1, 1)
> +		GRBG_BGR888(2, 1, 1, addAlphaBit_)
>  		/* Second pixel GRBG -> RGGB */
> -		RGGB_BGR888(1, 1, 1)
> +		RGGB_BGR888(1, 1, 1, addAlphaBit_)
>  		/* Same thing for third and fourth pixels */
> -		GRBG_BGR888(1, 1, 1)
> -		RGGB_BGR888(1, 2, 1)
> +		GRBG_BGR888(1, 1, 1, addAlphaBit_)
> +		RGGB_BGR888(1, 2, 1, addAlphaBit_)
>  		/* Skip 5th src byte with 4 x 2 least-significant-bits */
>  		x++;
>  	}
> @@ -227,12 +231,12 @@ void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
>  
>  	for (int x = 0; x < widthInBytes;) {
>  		/* Even pixel */
> -		GBRG_BGR888(2, 1, 1)
> +		GBRG_BGR888(2, 1, 1, addAlphaBit_)
>  		/* Odd pixel GBGR -> BGGR */
> -		BGGR_BGR888(1, 1, 1)
> +		BGGR_BGR888(1, 1, 1, addAlphaBit_)
>  		/* Same thing for next 2 pixels */
> -		GBRG_BGR888(1, 1, 1)
> -		BGGR_BGR888(1, 2, 1)
> +		GBRG_BGR888(1, 1, 1, addAlphaBit_)
> +		BGGR_BGR888(1, 2, 1, addAlphaBit_)
>  		/* Skip 5th src byte with 4 x 2 least-significant-bits */
>  		x++;
>  	}
> @@ -247,12 +251,12 @@ void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
>  
>  	for (int x = 0; x < widthInBytes;) {
>  		/* Even pixel */
> -		RGGB_BGR888(2, 1, 1)
> +		RGGB_BGR888(2, 1, 1, addAlphaBit_)
>  		/* Odd pixel RGGB -> GRBG */
> -		GRBG_BGR888(1, 1, 1)
> +		GRBG_BGR888(1, 1, 1, addAlphaBit_)
>  		/* Same thing for next 2 pixels */
> -		RGGB_BGR888(1, 1, 1)
> -		GRBG_BGR888(1, 2, 1)
> +		RGGB_BGR888(1, 1, 1, addAlphaBit_)
> +		GRBG_BGR888(1, 2, 1, addAlphaBit_)
>  		/* Skip 5th src byte with 4 x 2 least-significant-bits */
>  		x++;
>  	}
> @@ -280,7 +284,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
>  		config.bpp = (bayerFormat.bitDepth + 7) & ~7;
>  		config.patternSize.width = 2;
>  		config.patternSize.height = 2;
> -		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
> +		config.outputFormats = std::vector<PixelFormat>({
> +			formats::RGB888,
> +			formats::XRGB8888,
> +			formats::ARGB8888,
> +			formats::BGR888,
> +			formats::XBGR8888,
> +			formats::ABGR8888
> +		});
>  		return 0;
>  	}
>  
> @@ -290,7 +301,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
>  		config.bpp = 10;
>  		config.patternSize.width = 4; /* 5 bytes per *4* pixels */
>  		config.patternSize.height = 2;
> -		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
> +		config.outputFormats = std::vector<PixelFormat>({
> +			formats::RGB888,
> +			formats::XRGB8888,
> +			formats::ARGB8888,
> +			formats::BGR888,
> +			formats::XBGR8888,
> +			formats::ABGR8888
> +		});
>  		return 0;
>  	}
>  
> @@ -306,6 +324,12 @@ int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &c
>  		return 0;
>  	}
>  
> +	if (outputFormat == formats::XRGB8888 || outputFormat == formats::ARGB8888 ||
> +	    outputFormat == formats::XBGR8888 || outputFormat == formats::ABGR8888) {
> +		config.bpp = 32;
> +		return 0;
> +	}
> +
>  	LOG(Debayer, Info)
>  		<< "Unsupported output format " << outputFormat.toString();
>  	return -EINVAL;
> @@ -344,6 +368,7 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>  
>  	xShift_ = 0;
>  	swapRedBlueGains_ = false;
> +	addAlphaBit_ = false;
>  
>  	auto invalidFmt = []() -> int {
>  		LOG(Debayer, Error) << "Unsupported input output format combination";
> @@ -351,8 +376,16 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>  	};
>  
>  	switch (outputFormat) {
> +	case formats::XRGB8888:
> +	case formats::ARGB8888:
> +	  addAlphaBit_ = true;
> +	  [[fallthrough]];
>  	case formats::RGB888:
>  		break;
> +	case formats::XBGR8888:
> +	case formats::ABGR8888:
> +	  addAlphaBit_ = true;
> +	  [[fallthrough]];
>  	case formats::BGR888:
>  		/* Swap R and B in bayer order to generate BGR888 instead of RGB888 */
>  		swapRedBlueGains_ = true;
> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> index be7dcdca..4f77600d 100644
> --- a/src/libcamera/software_isp/debayer_cpu.h
> +++ b/src/libcamera/software_isp/debayer_cpu.h
> @@ -132,6 +132,7 @@ private:
>  	debayerFn debayer1_;
>  	debayerFn debayer2_;
>  	debayerFn debayer3_;
> +	bool addAlphaBit_;
>  	Rectangle window_;
>  	DebayerInputConfig inputConfig_;
>  	DebayerOutputConfig outputConfig_;

-- 
Regards,

Laurent Pinchart