Example: RGB deinterleaving

Consider a 24-bit RGB image where the image is an array of pixels, each with a red, green, and blue element. In memory this could appear as a repeating sequence of bytes: R0, G0, B0, R1, G1, B1, R2, G2, B2, and so on.

Because the RGB data is interleaved, accessing and manipulating the three separate color channels presents a problem to the programmer. In simple circumstances we could write our own single color channel operations by applying a “modulo 3” test to the indices of the interleaved RGB values. However, for more complex operations, such as Fourier transforms, it would make more sense to extract and split the channels.

We have an array of RGB values in memory and we want to deinterleave them and place the values in separate color arrays. A C procedure to do this might look like this:

void rgb_deinterleave_c(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_color) {
    /*
     * Split the interleaved bytes of "rgb" into the planar arrays "r", "g",
     * and "b". "len_color" is the number of pixels, so 3*len_color bytes are
     * read from "rgb" and len_color bytes are written to each output array.
     */
    const uint8_t *pixel = rgb;   /* first byte of the current 3-byte pixel */

    for (int px = 0; px < len_color; px++, pixel += 3) {
        r[px] = pixel[0];
        g[px] = pixel[1];
        b[px] = pixel[2];
    }
}

But there is an issue. Compiling with Arm Compiler 6 at optimization level -O3 (very high optimization) and examining the disassembly shows that no Neon instructions or registers are being used. Each individual 8-bit value is stored in a separate 64-bit general-purpose register. Considering that the full-width Neon registers are 128 bits wide, and could each hold 16 of the 8-bit values in this example, rewriting the solution to use Neon intrinsics should give us good results.

void rgb_deinterleave_neon(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_color) {
    /*
     * Take the elements of "rgb" and store the individual colors "r", "g", and "b".
     *
     * "len_color" is the number of pixels: 3*len_color bytes are read from
     * "rgb" and len_color bytes are written to each of "r", "g", and "b".
     * Pixels are processed 16 at a time with Neon; any remaining pixels
     * (when len_color is not a multiple of 16) are handled by a scalar loop
     * so that no trailing pixels are silently skipped.
     */
    int num8x16 = len_color / 16;
    for (int i = 0; i < num8x16; i++) {
        /* Load 48 interleaved bytes; val[0], val[1], val[2] are the three channels. */
        uint8x16x3_t intlv_rgb = vld3q_u8(rgb + 3*16*i);
        vst1q_u8(r + 16*i, intlv_rgb.val[0]);
        vst1q_u8(g + 16*i, intlv_rgb.val[1]);
        vst1q_u8(b + 16*i, intlv_rgb.val[2]);
    }

    /* Scalar tail: the last (len_color % 16) pixels that do not fill a vector. */
    for (int i = 16 * num8x16; i < len_color; i++) {
        r[i] = rgb[3*i];
        g[i] = rgb[3*i + 1];
        b[i] = rgb[3*i + 2];
    }
}

In this example we have used the following types and intrinsics:

Code element What is it? Why are we using it?
uint8x16_t An array of 16 8-bit unsigned integers. One uint8x16_t fits into a 128-bit register. We can ensure there are no wasted register bits even in C code.
uint8x16x3_t A struct with three uint8x16_t elements. A temporary holding area for the current color values in the loop.
vld3q_u8(…) A function which returns a uint8x16x3_t by loading a contiguous region of 3*16 bytes of memory. Each byte loaded is placed in one of the three uint8x16_t arrays in an alternating pattern. At the lowest level, this intrinsic guarantees the generation of an LD3 instruction, which loads the values from a given address into three Neon registers in an alternating pattern.
vst1q_u8(…) A function which stores a uint8x16_t at a given address. It stores a full 128-bit register full of byte values.
  • Full source code example: RGB deinterleaving
    /*
     * Copyright (C) Arm Limited, 2019 All rights reserved. 
     * 
     * The example code is provided to you as an aid to learning when working 
     * with Arm-based technology, including but not limited to programming tutorials. 
     * Arm hereby grants to you, subject to the terms and conditions of this Licence, 
     * a non-exclusive, non-transferable, non-sub-licensable, free-of-charge licence, 
     * to use and copy the Software solely for the purpose of demonstration and 
     * evaluation.
     * 
     * You accept that the Software has not been tested by Arm therefore the Software 
     * is provided "as is", without warranty of any kind, express or implied. In no 
     * event shall the authors or copyright holders be liable for any claim, damages 
     * or other liability, whether in action or contract, tort or otherwise, arising 
     * from, out of or in connection with the Software or the use of Software.
     */
    
    #include <stdio.h>
    #include <stdint.h>
    
    #include <arm_neon.h>
    
    
    void init_rgb(uint8_t *rgb, int num_rgbpix) {
    	/*
    	 * Fill the first "num_rgbpix" bytes of "rgb" with the repeating
    	 * pattern 0, 1, 2, so each byte holds its position within its triplet.
    	 */
    	uint8_t channel = 0;
    	int idx = 0;
    	while (idx < num_rgbpix) {
    		rgb[idx] = channel;
    		channel = (channel == 2) ? 0 : (uint8_t)(channel + 1);
    		idx++;
    	}
    }
    
    void print_pixvals(uint8_t *array, int num_rgbpix) {
    	/*
    	 * Print the first "num_rgbpix" bytes of "array" as comma-terminated
    	 * decimals, with a space after every third value, then a newline.
    	 */
    	int idx = 0;
    	while (idx < num_rgbpix) {
    		printf("%d,", array[idx]);
    		idx++;
    		/* A multiple of three values has been printed: end of a triplet. */
    		if (idx % 3 == 0) {
    			printf(" ");
    		}
    	}
    	printf("\n");
    }
    
    void rgb_deinterleave_neon(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_colour) {
    	/*
    	 * Take the elements of "rgb" and store the individual colours "r", "g", and "b".
    	 *
    	 * "len_colour" is the number of pixels: 3*len_colour bytes are read from
    	 * "rgb" and len_colour bytes are written to each of "r", "g", and "b".
    	 * Pixels are processed 16 at a time with Neon; any remaining pixels
    	 * (when len_colour is not a multiple of 16) are handled by a scalar
    	 * loop so that no trailing pixels are silently skipped.
    	 */
    	int num8x16 = len_colour/16;
    	for (int i=0; i<num8x16; i++) {
    		/* Load 48 interleaved bytes; val[0], val[1], val[2] are the three channels. */
    		uint8x16x3_t intlv_rgb = vld3q_u8(rgb+3*16*i);
    		vst1q_u8(r+16*i, intlv_rgb.val[0]);
    		vst1q_u8(g+16*i, intlv_rgb.val[1]);
    		vst1q_u8(b+16*i, intlv_rgb.val[2]);
    	}

    	/* Scalar tail: the last (len_colour % 16) pixels that do not fill a vector. */
    	for (int i=16*num8x16; i<len_colour; i++) {
    		r[i] = rgb[3*i];
    		g[i] = rgb[3*i+1];
    		b[i] = rgb[3*i+2];
    	}
    }
    
    void rgb_reinterleave_neon(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_colour) {
    	/*
    	 * Take the elements of "r", "g", and "b" and store modulo 3 alternating colour values in "rgb"
    	 *
    	 * "len_colour" is the number of pixels: len_colour bytes are read from
    	 * each of "r", "g", and "b", and 3*len_colour bytes are written to "rgb".
    	 * Pixels are processed 16 at a time with Neon; any remaining pixels
    	 * (when len_colour is not a multiple of 16) are handled by a scalar
    	 * loop so that no trailing pixels are silently skipped.
    	 */
    	int num8x16 = len_colour/16;
    	for (int i=0; i<num8x16; i++) {
    		uint8x16x3_t intlv_rgb;
    		intlv_rgb.val[0] = vld1q_u8(r+16*i);
    		intlv_rgb.val[1] = vld1q_u8(g+16*i);
    		intlv_rgb.val[2] = vld1q_u8(b+16*i);
    		/* Store the three vectors as 48 interleaved bytes. */
    		vst3q_u8(rgb+3*16*i, intlv_rgb);
    	}

    	/* Scalar tail: the last (len_colour % 16) pixels that do not fill a vector. */
    	for (int i=16*num8x16; i<len_colour; i++) {
    		rgb[3*i] = r[i];
    		rgb[3*i+1] = g[i];
    		rgb[3*i+2] = b[i];
    	}
    }
    
    void add_channels_neon(uint8_t *x, uint8_t *y, uint8_t *z, int len_colour) {
    	/*
    	 * add "y" to "z" and store in "x".
    	 *
    	 * Operates on "len_colour" bytes of each array. The addition is done
    	 * per byte with vaddq_u8, 16 bytes at a time. Any remaining bytes
    	 * (when len_colour is not a multiple of 16) are handled by a scalar
    	 * loop whose result is truncated to 8 bits, so that no trailing
    	 * elements are silently skipped.
    	 */
    	int num8x16 = len_colour/16;
    	for (int i=0; i<num8x16; i++) {
    		uint8x16_t q_8b_y = vld1q_u8(y+16*i);
    		uint8x16_t q_8b_z = vld1q_u8(z+16*i);
    		uint8x16_t q_8b_x = vaddq_u8(q_8b_y, q_8b_z);
    		vst1q_u8(x+16*i, q_8b_x);
    	}

    	/* Scalar tail: the last (len_colour % 16) elements that do not fill a vector. */
    	for (int i=16*num8x16; i<len_colour; i++) {
    		x[i] = (uint8_t)(y[i] + z[i]);
    	}
    }
    
    void rgb_deinterleave_c(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_colour) {
    	/*
    	 * Split the interleaved bytes of "rgb" into the planar arrays "r",
    	 * "g", and "b". "len_colour" is the number of pixels, so 3*len_colour
    	 * bytes are read and len_colour bytes are written per output array.
    	 */
    	int px = 0;    /* pixel index into r/g/b */
    	int base = 0;  /* byte offset of that pixel in rgb */
    	while (px < len_colour) {
    		r[px] = rgb[base];
    		g[px] = rgb[base+1];
    		b[px] = rgb[base+2];
    		px++;
    		base = 3*px;
    	}
    }
    
    void rgb_reinterleave_c(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_colours) {
    	/*
    	 * Merge the planar arrays "r", "g", and "b" back into "rgb" as
    	 * consecutive 3-byte pixels. "len_colours" is the number of pixels,
    	 * so 3*len_colours bytes of "rgb" are written.
    	 */
    	uint8_t *pixel = rgb;   /* first byte of the current output pixel */

    	for (int px = 0; px < len_colours; px++, pixel += 3) {
    		pixel[0] = r[px];
    		pixel[1] = g[px];
    		pixel[2] = b[px];
    	}
    }
    
    void add_channels_c(uint8_t *rgb, int num_rgbpix) {
    	/*
    	 * For every complete 3-byte pixel in the first "num_rgbpix" bytes of
    	 * "rgb", overwrite the first byte with the sum of the second and third
    	 * bytes (truncated to 8 bits).
    	 *
    	 * The loop steps one pixel at a time and only runs while all three
    	 * bytes of the pixel lie inside the buffer, so a trailing partial
    	 * pixel (num_rgbpix not a multiple of 3) is left untouched instead of
    	 * reading past the end of the array.
    	 */
    	for (int i=0; i+2<num_rgbpix; i+=3) {
    		rgb[i] = (uint8_t)(rgb[i+1] + rgb[i+2]);
    	}
    }
    
    void print_channel_c(uint8_t *arr, int num_vals) {
    	/*
    	 * Print the first "num_vals" bytes of "arr" as comma-terminated
    	 * decimals on a single line, followed by a newline.
    	 */
    	const uint8_t *cursor = arr;
    	int remaining = num_vals;
    	while (remaining > 0) {
    		printf("%d,", *cursor);
    		cursor++;
    		remaining--;
    	}
    	printf("\n");
    }
    
    int main() {
    	/*
    	 * Demo driver: build a small interleaved buffer, split it into
    	 * channels with the C and the Neon routines (printing each result),
    	 * then add two channels and re-interleave with the Neon routines.
    	 */
    	int total_bytes = 3*16*1;           /* one 16-pixel batch, 3 bytes per pixel */
    	int channel_len = total_bytes/3;    /* values per colour channel */

    	uint8_t rgb[total_bytes];
    	uint8_t r[channel_len];
    	uint8_t g[channel_len];
    	uint8_t b[channel_len];

    	/* Interleaved input and a dump of it. */
    	init_rgb(rgb, total_bytes);
    	print_pixvals(rgb, total_bytes);

    	/* Reference split in plain C. */
    	printf("============ C    ===========\n");
    	rgb_deinterleave_c(r, g, b, rgb, channel_len);
    	print_channel_c(r, channel_len);
    	print_channel_c(g, channel_len);
    	print_channel_c(b, channel_len);

    	/* Same split with Neon; it overwrites r, g and b. */
    	printf("============ Neon ===========\n");
    	rgb_deinterleave_neon(r, g, b, rgb, channel_len);
    	print_channel_c(r, channel_len);
    	print_channel_c(g, channel_len);
    	print_channel_c(b, channel_len);

    	/* r = g + b, then pack the channels back into rgb and dump it. */
    	add_channels_neon(r, g, b, channel_len);
    	rgb_reinterleave_neon(r, g, b, rgb, channel_len);
    	print_pixvals(rgb, total_bytes);

    	return 0;
    }
    
Previous Next