/**
 *    @file neon_example.c
 *    @version 1.0.0
 *  
 *    @brief This example shows sample NEON assembly code. NEON is a SIMD (single instruction, multiple data) instruction set extension of the i.MX6's ARM CPUs.
 *  
 *    BLT_DISCLAIMER
 *  
 *    @author Harald Krapfenbauer
 */


#include <stdint.h>
#include <assert.h>
#include <stdio.h>


/**
 * @brief Create a mask that marks every amplitude above a threshold.
 *
 * Each of the xRes * yRes input values is compared (unsigned, strictly
 * greater than) against @p threshold. The matching element of @p outMask is
 * set to 0xFFFF when the amplitude is greater and to 0x0000 otherwise.
 *
 * On 32-bit ARM targets with NEON enabled the work is done 16 values at a
 * time in NEON assembly; on every other target an equivalent scalar loop is
 * used, so the function produces the same result everywhere.
 *
 * @param amplitudes Input buffer holding xRes * yRes amplitude values (only read).
 * @param xRes       Horizontal resolution; must be a multiple of 16 (asserted).
 * @param yRes       Vertical resolution.
 * @param threshold  Amplitudes strictly greater than this value are marked.
 * @param outMask    Output buffer with room for xRes * yRes mask values.
 *
 * @return Always 0.
 */
int neonCreateAmplitudeMask(uint16_t *amplitudes, uint16_t xRes, uint16_t yRes, uint16_t threshold, uint16_t *outMask)
{
	/* We calculate 16 values per loop iteration, so xRes must be a multiple of 16 */
	assert(xRes % 16 == 0);

	/* Widen before multiplying: two uint16_t operands are promoted to (signed)
	   int, and e.g. 65520 * 65535 would overflow it. */
	uint32_t blocks = (uint32_t)xRes * yRes / 16u;
	if (blocks == 0) {
		/* Nothing to do; the assembly loop below requires at least one block */
		return 0;
	}

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
	uint16_t *ampIn = amplitudes;
	uint16_t *maskOut = outMask;

	/* The threshold set-up and the loop live in ONE asm statement. The compiler
	   gives no guarantee that NEON registers keep their contents between two
	   separate asm statements, so the threshold register must be written and
	   read inside the same statement. */
	__asm__ volatile(
		/* These are NEON assembly instructions */
		"vdup.16 q0, %[thres] \n" /* broadcast threshold into all 8 lanes of Q0 (=D0 and D1), only done once per function call */
		"1: \n"
		"vld4.16 {d4, d5, d6, d7}, [%[ampIn]]! \n" /* load amplitude, each D-reg is 64 bits wide, so we are loading 4*4=16 elements */
		"vcgt.u16 q4, q2, q0 \n" /* compare Q2 reg (=D4 and D5) to Q0 reg and store result in Q4 reg (=D8 and D9) */
		"vcgt.u16 q5, q3, q0 \n" /* do the same for other half of values (Q3 = D6 and D7, result in Q5 = D10 and D11) */
		"vst4.16 {d8, d9, d10, d11}, [%[maskOut]]! \n" /* Store to memory; vst4 re-interleaves exactly what vld4 de-interleaved, so element order is preserved */
		"subs %[blocks], %[blocks], #1 \n" /* one block of 16 values done */
		"bne 1b \n" /* loop until all blocks are processed */
		: /* output operands */
		  [ampIn] "+r" (ampIn), /* Note, 'ampIn' is automatically incremented by NEON instruction, so it is a read/write variable, which counts as output operand! */
		  [maskOut] "+r" (maskOut),
		  [blocks] "+r" (blocks)
		: /* input operands */
		  [thres] "r" ((uint32_t)threshold) /* vdup.16 uses the low 16 bits of the core register */
		: /* clobber list */
		  "d0", "d1", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "cc", "memory");
#else
	/* Portable fallback with the same semantics as the NEON code above */
	uint32_t count = blocks * 16u;
	for (uint32_t i = 0; i < count; i++) {
		outMask[i] = (amplitudes[i] > threshold) ? (uint16_t)0xFFFF : (uint16_t)0;
	}
#endif

	return 0;
}
