Initial commit
Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c (new file, 448 lines)
| @@ -0,0 +1,448 @@ | ||||
| /* | ||||
|  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. | ||||
|  * | ||||
|  * SPDX-License-Identifier: Apache-2.0 | ||||
|  * | ||||
|  * Licensed under the Apache License, Version 2.0 (the License); you may | ||||
|  * not use this file except in compliance with the License. | ||||
|  * You may obtain a copy of the License at | ||||
|  * | ||||
|  * www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an AS IS BASIS, WITHOUT | ||||
|  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
|  | ||||
| /* ---------------------------------------------------------------------- | ||||
|  * Project:      CMSIS NN Library | ||||
|  * Title:        arm_pool_q7_HWC.c | ||||
|  * Description:  Pooling function implementations | ||||
|  * | ||||
|  * $Date:        17. January 2018 | ||||
|  * $Revision:    V.1.0.0 | ||||
|  * | ||||
|  * Target Processor:  Cortex-M cores | ||||
|  * | ||||
|  * -------------------------------------------------------------------- */ | ||||
|  | ||||
| #include "arm_math.h" | ||||
| #include "arm_nnfunctions.h" | ||||
|  | ||||
| #if defined (ARM_MATH_DSP) | ||||
|  | ||||
| /** | ||||
|  * @brief A few utility functions used by pooling functions | ||||
|  * | ||||
|  *  | ||||
|  */ | ||||
|  | ||||
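| /* Scale the q15 accumulator buffer back to q7: each element is divided by | ||||
|  * "scale" (the number of samples accumulated into it), i.e. the average. | ||||
|  */ | ||||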
| static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale) | ||||
| { | ||||
|     int       i; | ||||
|  | ||||
|     for (i = 0; i < length; i++) | ||||
|     { | ||||
|         target[i] = (q7_t) (buffer[i] / scale); | ||||
|     } | ||||
| } | ||||
|  | ||||
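| /* In-place element-wise maximum: every byte of "base" is replaced by the | ||||
|  * larger of itself and the corresponding byte of "target". Four q7 values | ||||
|  * are handled per 32-bit SIMD word; only length >> 2 words are processed, | ||||
|  * so the code effectively assumes the length is a multiple of 4. | ||||
|  */ | ||||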
| static void compare_and_replace_if_larger_q7(q7_t * base,   // base data | ||||
|                                              q7_t * target, // compare target | ||||
|                                              const uint16_t length  // data length | ||||
|     ) | ||||
| { | ||||
|     q7_t     *pIn = base; | ||||
|     q7_t     *pCom = target; | ||||
|     union arm_nnword in; | ||||
|     union arm_nnword com; | ||||
|     uint16_t  cnt = length >> 2; | ||||
|  | ||||
|     while (cnt > 0u) | ||||
|     { | ||||
|         in.word = *__SIMD32(pIn); | ||||
|         com.word = *__SIMD32(pCom)++; | ||||
|  | ||||
|         // if version | ||||
|         if (com.bytes[0] > in.bytes[0]) | ||||
|             in.bytes[0] = com.bytes[0]; | ||||
|         if (com.bytes[1] > in.bytes[1]) | ||||
|             in.bytes[1] = com.bytes[1]; | ||||
|         if (com.bytes[2] > in.bytes[2]) | ||||
|             in.bytes[2] = com.bytes[2]; | ||||
|         if (com.bytes[3] > in.bytes[3]) | ||||
|             in.bytes[3] = com.bytes[3]; | ||||
|  | ||||
|         *__SIMD32(pIn)++ = in.word; | ||||
|  | ||||
|         cnt--; | ||||
|     } | ||||
| } | ||||
|  | ||||
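| /* Accumulate q7 values from "target" into the q15 buffer "base". The SIMD | ||||
|  * loop unpacks four q7 values per word using __SXTB16/__ROR, reorders them | ||||
|  * with __PKHBT/__PKHTB and adds them with saturating __QADD16; the scalar | ||||
|  * tail loop handles the remaining length % 4 elements. | ||||
|  */ | ||||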
| static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length) | ||||
| { | ||||
|     q15_t    *pCnt = base; | ||||
|     q7_t     *pV = target; | ||||
|     q31_t     v1, v2, vo1, vo2; | ||||
|     uint16_t  cnt = length >> 2; | ||||
|     q31_t     in; | ||||
|  | ||||
|     while (cnt > 0u) | ||||
|     { | ||||
|         q31_t     value = *__SIMD32(pV)++; | ||||
|         v1 = __SXTB16(__ROR(value, 8)); | ||||
|         v2 = __SXTB16(value); | ||||
| #ifndef ARM_MATH_BIG_ENDIAN | ||||
|  | ||||
|         vo2 = __PKHTB(v1, v2, 16); | ||||
|         vo1 = __PKHBT(v2, v1, 16); | ||||
|  | ||||
| #else | ||||
|  | ||||
|         vo1 = __PKHTB(v1, v2, 16); | ||||
|         vo2 = __PKHBT(v2, v1, 16); | ||||
|  | ||||
| #endif | ||||
|  | ||||
|         in = *__SIMD32(pCnt); | ||||
|         *__SIMD32(pCnt)++ = __QADD16(vo1, in); | ||||
|  | ||||
|         in = *__SIMD32(pCnt); | ||||
|         *__SIMD32(pCnt)++ = __QADD16(vo2, in); | ||||
|  | ||||
|         cnt--; | ||||
|     } | ||||
|     cnt = length & 0x3; | ||||
|     while (cnt > 0u) | ||||
|     { | ||||
|         *pCnt++ += *pV++; | ||||
|         cnt--; | ||||
|     } | ||||
| } | ||||
|  | ||||
| #endif                          // ARM_MATH_DSP | ||||
|  | ||||
| /** | ||||
|  *  @ingroup groupNN | ||||
|  */ | ||||
|  | ||||
| /** | ||||
|  * @addtogroup Pooling | ||||
|  * @{ | ||||
|  */ | ||||
|  | ||||
|   /** | ||||
|    * @brief Q7 max pooling function | ||||
|    * @param[in, out]  Im_in       pointer to input tensor | ||||
|    * @param[in]       dim_im_in   input tensor dimension | ||||
|    * @param[in]       ch_im_in    number of input tensor channels | ||||
|    * @param[in]       dim_kernel  filter kernel size | ||||
|    * @param[in]       padding     padding sizes | ||||
|    * @param[in]       stride      pooling stride | ||||
|    * @param[in]       dim_im_out  output tensor dimension | ||||
|    * @param[in,out]   bufferA     pointer to buffer space for input | ||||
|    * @param[in,out]   Im_out      pointer to output tensor | ||||
|    * @return none. | ||||
|    * | ||||
|    * @details | ||||
|    * | ||||
|    * <b>Buffer size:</b> | ||||
|    * | ||||
|    * bufferA size:  0 | ||||
|    * | ||||
|    * The pooling function is implemented as split x-pooling then | ||||
|    * y-pooling. | ||||
|    * | ||||
|    * This pooling function is input-destructive. Input data is undefined | ||||
|    * after calling this function. | ||||
|    * | ||||
|    */ | ||||
|  | ||||
| void | ||||
| arm_maxpool_q7_HWC(q7_t * Im_in, | ||||
|                    const uint16_t dim_im_in, | ||||
|                    const uint16_t ch_im_in, | ||||
|                    const uint16_t dim_kernel, | ||||
|                    const uint16_t padding, | ||||
|                    const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out) | ||||
| { | ||||
|  | ||||
| #if defined (ARM_MATH_DSP) | ||||
|     /* Run the following code for Cortex-M4 and Cortex-M7 */ | ||||
|  | ||||
|     int16_t   i_x, i_y; | ||||
|  | ||||
|     /* First pool along the x axis. Each window maximum is written back in | ||||
|      * place into the first dim_im_out pixel positions of the input row, | ||||
|      * which is why this function is input-destructive. | ||||
|      */ | ||||
|     for (i_y = 0; i_y < dim_im_in; i_y++) | ||||
|     { | ||||
|  | ||||
|         for (i_x = 0; i_x < dim_im_out; i_x++) | ||||
|         { | ||||
|             /* for each output pixel */ | ||||
|             q7_t     *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; | ||||
|             q7_t     *win_start; | ||||
|             q7_t     *win_stop; | ||||
|             if (i_x * stride - padding < 0) | ||||
|             { | ||||
|                 win_start = target; | ||||
|             } else | ||||
|             { | ||||
|                 win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; | ||||
|             } | ||||
|  | ||||
|             if (i_x * stride - padding + dim_kernel >= dim_im_in) | ||||
|             { | ||||
|                 win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; | ||||
|             } else | ||||
|             { | ||||
|                 win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; | ||||
|             } | ||||
|  | ||||
|             /* first step is to copy over initial data */ | ||||
|             /* arm_copy_q7(win_start, target, ch_im_in); */ | ||||
|             memmove(target, win_start, ch_im_in); | ||||
|  | ||||
|             /* start the max operation from the second part */ | ||||
|             win_start += ch_im_in; | ||||
|             for (; win_start < win_stop; win_start += ch_im_in) | ||||
|             { | ||||
|                 compare_and_replace_if_larger_q7(target, win_start, ch_im_in); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /* Then pool along the y axis, reading the x-pooled rows from Im_in and | ||||
|      * writing the final result into Im_out. | ||||
|      */ | ||||
|     for (i_y = 0; i_y < dim_im_out; i_y++) | ||||
|     { | ||||
|  | ||||
|         /* for each output row */ | ||||
|         q7_t     *target = Im_out + i_y * dim_im_out * ch_im_in; | ||||
|         q7_t     *row_start; | ||||
|         q7_t     *row_end; | ||||
|         /* setting the starting row */ | ||||
|         if (i_y * stride - padding < 0) | ||||
|         { | ||||
|             row_start = Im_in; | ||||
|         } else | ||||
|         { | ||||
|             row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; | ||||
|         } | ||||
|         /* setting the stopping row */ | ||||
|         if (i_y * stride - padding + dim_kernel >= dim_im_in) | ||||
|         { | ||||
|             row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; | ||||
|         } else | ||||
|         { | ||||
|             row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; | ||||
|         } | ||||
|  | ||||
|         /* copy over the first row */ | ||||
|         /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */ | ||||
|         memmove(target, row_start, dim_im_out * ch_im_in); | ||||
|  | ||||
|         /* move over to next row */ | ||||
|         row_start += ch_im_in * dim_im_in; | ||||
|  | ||||
|         for (; row_start < row_end; row_start += dim_im_in * ch_im_in) | ||||
|         { | ||||
|             compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| #else | ||||
|     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ | ||||
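|     /* The reference path computes each output element directly with a | ||||
|      * per-channel sliding window and explicit boundary checks; unlike the | ||||
|      * DSP path above it does not modify the input tensor. | ||||
|      */ | ||||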
|  | ||||
|     int16_t   i_ch_in, i_x, i_y; | ||||
|     int16_t   k_x, k_y; | ||||
|  | ||||
|     for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) | ||||
|     { | ||||
|         for (i_y = 0; i_y < dim_im_out; i_y++) | ||||
|         { | ||||
|             for (i_x = 0; i_x < dim_im_out; i_x++) | ||||
|             { | ||||
|                 int       max = -129;  /* one below the q7 minimum of -128 */ | ||||
|                 for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) | ||||
|                 { | ||||
|                     for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) | ||||
|                     { | ||||
|                         if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) | ||||
|                         { | ||||
|                             if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) | ||||
|                             { | ||||
|                                 max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
| #endif                          /* ARM_MATH_DSP */ | ||||
|  | ||||
| } | ||||
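|  | ||||
| /* Illustrative usage sketch (not part of the original library): a | ||||
|  * hypothetical call pooling an 8x8x4 HWC q7 image down to 4x4x4 with a 2x2 | ||||
|  * kernel, stride 2 and no padding. All names and sizes are assumptions made | ||||
|  * up for this example; max pooling needs no scratch buffer, so bufferA may | ||||
|  * be NULL. Guarded with #if 0 so it has no effect on the build. | ||||
|  */ | ||||
| #if 0 | ||||
| static void maxpool_example(void) | ||||
| { | ||||
|     static q7_t img[8 * 8 * 4];    /* HWC input, overwritten by the call */ | ||||
|     static q7_t out[4 * 4 * 4];    /* HWC output */ | ||||
|  | ||||
|     arm_maxpool_q7_HWC(img, | ||||
|                        8,          /* dim_im_in */ | ||||
|                        4,          /* ch_im_in */ | ||||
|                        2,          /* dim_kernel */ | ||||
|                        0,          /* padding */ | ||||
|                        2,          /* stride */ | ||||
|                        4,          /* dim_im_out */ | ||||
|                        NULL,       /* bufferA, unused for max pooling */ | ||||
|                        out); | ||||
| } | ||||
| #endif | ||||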
|  | ||||
|   /** | ||||
|    * @brief Q7 average pooling function | ||||
|    * @param[in,out]   Im_in       pointer to input tensor | ||||
|    * @param[in]       dim_im_in   input tensor dimension | ||||
|    * @param[in]       ch_im_in    number of input tensor channels | ||||
|    * @param[in]       dim_kernel  filter kernel size | ||||
|    * @param[in]       padding     padding sizes | ||||
|    * @param[in]       stride      pooling stride | ||||
|    * @param[in]       dim_im_out  output tensor dimension | ||||
|    * @param[in,out]   bufferA     pointer to buffer space for input | ||||
|    * @param[in,out]   Im_out      pointer to output tensor | ||||
|    * @return none. | ||||
|    * | ||||
|    * @details | ||||
|    * | ||||
|    * <b>Buffer size:</b> | ||||
|    * | ||||
|    * bufferA size:  2*dim_im_out*ch_im_in bytes (q15 accumulation buffer) | ||||
|    * | ||||
|    * The pooling function is implemented as split x-pooling then | ||||
|    * y-pooling. | ||||
|    * | ||||
|    * This pooling function is input-destructive. Input data is undefined | ||||
|    * after calling this function. | ||||
|    * | ||||
|    */ | ||||
|  | ||||
| void | ||||
| arm_avepool_q7_HWC(q7_t * Im_in, | ||||
|                    const uint16_t dim_im_in, | ||||
|                    const uint16_t ch_im_in, | ||||
|                    const uint16_t dim_kernel, | ||||
|                    const uint16_t padding, | ||||
|                    const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out) | ||||
| { | ||||
|  | ||||
| #if defined (ARM_MATH_DSP) | ||||
|     /* Run the following code for Cortex-M4 and Cortex-M7 */ | ||||
|  | ||||
|     /* bufferA is reused as a q15 accumulation buffer, hence the required | ||||
|      * scratch size of 2*dim_im_out*ch_im_in bytes. | ||||
|      */ | ||||
|     q15_t    *buffer = (q15_t *) bufferA; | ||||
|     int16_t   i_x, i_y; | ||||
|     int16_t   count = 0; | ||||
|  | ||||
|     /* First pool along the x axis: accumulate each window into the q15 | ||||
|      * buffer, then write the scaled average back in place into Im_in. | ||||
|      */ | ||||
|     for (i_y = 0; i_y < dim_im_in; i_y++) | ||||
|     { | ||||
|  | ||||
|         for (i_x = 0; i_x < dim_im_out; i_x++) | ||||
|         { | ||||
|             /* for each output pixel */ | ||||
|             q7_t     *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; | ||||
|             q7_t     *win_start; | ||||
|             q7_t     *win_stop; | ||||
|             if (i_x * stride - padding < 0) | ||||
|             { | ||||
|                 win_start = target; | ||||
|             } else | ||||
|             { | ||||
|                 win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; | ||||
|             } | ||||
|  | ||||
|             if (i_x * stride - padding + dim_kernel >= dim_im_in) | ||||
|             { | ||||
|                 win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; | ||||
|             } else | ||||
|             { | ||||
|                 win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; | ||||
|             } | ||||
|  | ||||
|             /* first step is to copy over initial data */ | ||||
|             arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in); | ||||
|             count = 1; | ||||
|  | ||||
|             /* start the accumulation from the second column of the window */ | ||||
|             win_start += ch_im_in; | ||||
|             for (; win_start < win_stop; win_start += ch_im_in) | ||||
|             { | ||||
|                 accumulate_q7_to_q15(buffer, win_start, ch_im_in); | ||||
|                 count++; | ||||
|             } | ||||
|             buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /* Then pool along the y axis over the x-pooled rows and write the result into Im_out */ | ||||
|     for (i_y = 0; i_y < dim_im_out; i_y++) | ||||
|     { | ||||
|         /* for each output row */ | ||||
|         q7_t     *target = Im_out + i_y * dim_im_out * ch_im_in; | ||||
|         q7_t     *row_start; | ||||
|         q7_t     *row_end; | ||||
|         /* setting the starting row */ | ||||
|         if (i_y * stride - padding < 0) | ||||
|         { | ||||
|             row_start = Im_in; | ||||
|         } else | ||||
|         { | ||||
|             row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; | ||||
|         } | ||||
|         /* setting the stopping row */ | ||||
|         if (i_y * stride - padding + dim_kernel >= dim_im_in) | ||||
|         { | ||||
|             row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; | ||||
|         } else | ||||
|         { | ||||
|             row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; | ||||
|         } | ||||
|  | ||||
|         /* copy over the first row */ | ||||
|         arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); | ||||
|         count = 1; | ||||
|  | ||||
|         /* move over to next row */ | ||||
|         row_start += ch_im_in * dim_im_in; | ||||
|  | ||||
|         for (; row_start < row_end; row_start += dim_im_in * ch_im_in) | ||||
|         { | ||||
|             accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); | ||||
|             count++; | ||||
|         } | ||||
|         buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count); | ||||
|     } | ||||
|  | ||||
| #else | ||||
|     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ | ||||
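|     /* The reference path sums the valid window samples per channel, divides | ||||
|      * by their count and writes the result directly to Im_out; it reads | ||||
|      * Im_in without modifying it and does not use bufferA. | ||||
|      */ | ||||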
|  | ||||
|     int16_t   i_ch_in, i_x, i_y; | ||||
|     int16_t   k_x, k_y; | ||||
|  | ||||
|     for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) | ||||
|     { | ||||
|         for (i_y = 0; i_y < dim_im_out; i_y++) | ||||
|         { | ||||
|             for (i_x = 0; i_x < dim_im_out; i_x++) | ||||
|             { | ||||
|                 int       sum = 0; | ||||
|                 int       count = 0; | ||||
|                 for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) | ||||
|                 { | ||||
|                     for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) | ||||
|                     { | ||||
|                         if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) | ||||
|                         { | ||||
|                             sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; | ||||
|                             count++; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
| #endif                          /* ARM_MATH_DSP */ | ||||
|  | ||||
| } | ||||
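|  | ||||
| /* Illustrative usage sketch (not part of the original library): a | ||||
|  * hypothetical average-pooling call on an 8x8x4 HWC q7 image with a 2x2 | ||||
|  * kernel, stride 2 and no padding. All names and sizes are assumptions made | ||||
|  * up for this example; unlike max pooling, bufferA must provide | ||||
|  * 2*dim_im_out*ch_im_in bytes of scratch space for the q15 accumulators. | ||||
|  * Guarded with #if 0 so it has no effect on the build. | ||||
|  */ | ||||
| #if 0 | ||||
| static void avepool_example(void) | ||||
| { | ||||
|     static q7_t img[8 * 8 * 4];      /* HWC input, overwritten by the call */ | ||||
|     static q7_t out[4 * 4 * 4];      /* HWC output */ | ||||
|     static q7_t scratch[2 * 4 * 4];  /* 2 * dim_im_out * ch_im_in bytes */ | ||||
|  | ||||
|     arm_avepool_q7_HWC(img, | ||||
|                        8,            /* dim_im_in */ | ||||
|                        4,            /* ch_im_in */ | ||||
|                        2,            /* dim_kernel */ | ||||
|                        0,            /* padding */ | ||||
|                        2,            /* stride */ | ||||
|                        4,            /* dim_im_out */ | ||||
|                        scratch,      /* bufferA */ | ||||
|                        out); | ||||
| } | ||||
| #endif | ||||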
|  | ||||
| /** | ||||
|  * @} end of Pooling group | ||||
|  */ | ||||