7 InnerProduct(
const void *pVect1,
const void *pVect2,
const void *qty_ptr) {
8 size_t qty = *((
size_t *) qty_ptr);
10 for (
unsigned i = 0; i < qty; i++) {
11 res += ((
float *) pVect1)[i] * ((
float *) pVect2)[i];
26 InnerProductSIMD4ExtAVX(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
27 float PORTABLE_ALIGN32 TmpRes[8];
28 float *pVect1 = (
float *) pVect1v;
29 float *pVect2 = (
float *) pVect2v;
30 size_t qty = *((
size_t *) qty_ptr);
32 size_t qty16 = qty / 16;
33 size_t qty4 = qty / 4;
35 const float *pEnd1 = pVect1 + 16 * qty16;
36 const float *pEnd2 = pVect1 + 4 * qty4;
38 __m256 sum256 = _mm256_set1_ps(0);
40 while (pVect1 < pEnd1) {
43 __m256 v1 = _mm256_loadu_ps(pVect1);
45 __m256 v2 = _mm256_loadu_ps(pVect2);
47 sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
49 v1 = _mm256_loadu_ps(pVect1);
51 v2 = _mm256_loadu_ps(pVect2);
53 sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
57 __m128 sum_prod = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
59 while (pVect1 < pEnd2) {
60 v1 = _mm_loadu_ps(pVect1);
62 v2 = _mm_loadu_ps(pVect2);
64 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
67 _mm_store_ps(TmpRes, sum_prod);
68 float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];;
73 InnerProductDistanceSIMD4ExtAVX(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
74 return 1.0f - InnerProductSIMD4ExtAVX(pVect1v, pVect2v, qty_ptr);
82 InnerProductSIMD4ExtSSE(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
83 float PORTABLE_ALIGN32 TmpRes[8];
84 float *pVect1 = (
float *) pVect1v;
85 float *pVect2 = (
float *) pVect2v;
86 size_t qty = *((
size_t *) qty_ptr);
88 size_t qty16 = qty / 16;
89 size_t qty4 = qty / 4;
91 const float *pEnd1 = pVect1 + 16 * qty16;
92 const float *pEnd2 = pVect1 + 4 * qty4;
95 __m128 sum_prod = _mm_set1_ps(0);
97 while (pVect1 < pEnd1) {
98 v1 = _mm_loadu_ps(pVect1);
100 v2 = _mm_loadu_ps(pVect2);
102 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
104 v1 = _mm_loadu_ps(pVect1);
106 v2 = _mm_loadu_ps(pVect2);
108 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
110 v1 = _mm_loadu_ps(pVect1);
112 v2 = _mm_loadu_ps(pVect2);
114 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
116 v1 = _mm_loadu_ps(pVect1);
118 v2 = _mm_loadu_ps(pVect2);
120 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
123 while (pVect1 < pEnd2) {
124 v1 = _mm_loadu_ps(pVect1);
126 v2 = _mm_loadu_ps(pVect2);
128 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
131 _mm_store_ps(TmpRes, sum_prod);
132 float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
138 InnerProductDistanceSIMD4ExtSSE(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
139 return 1.0f - InnerProductSIMD4ExtSSE(pVect1v, pVect2v, qty_ptr);
145#if defined(USE_AVX512)
148 InnerProductSIMD16ExtAVX512(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
149 float PORTABLE_ALIGN64 TmpRes[16];
150 float *pVect1 = (
float *) pVect1v;
151 float *pVect2 = (
float *) pVect2v;
152 size_t qty = *((
size_t *) qty_ptr);
154 size_t qty16 = qty / 16;
157 const float *pEnd1 = pVect1 + 16 * qty16;
159 __m512 sum512 = _mm512_set1_ps(0);
161 while (pVect1 < pEnd1) {
164 __m512 v1 = _mm512_loadu_ps(pVect1);
166 __m512 v2 = _mm512_loadu_ps(pVect2);
168 sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));
171 _mm512_store_ps(TmpRes, sum512);
172 float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
178 InnerProductDistanceSIMD16ExtAVX512(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
179 return 1.0f - InnerProductSIMD16ExtAVX512(pVect1v, pVect2v, qty_ptr);
187 InnerProductSIMD16ExtAVX(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
188 float PORTABLE_ALIGN32 TmpRes[8];
189 float *pVect1 = (
float *) pVect1v;
190 float *pVect2 = (
float *) pVect2v;
191 size_t qty = *((
size_t *) qty_ptr);
193 size_t qty16 = qty / 16;
196 const float *pEnd1 = pVect1 + 16 * qty16;
198 __m256 sum256 = _mm256_set1_ps(0);
200 while (pVect1 < pEnd1) {
203 __m256 v1 = _mm256_loadu_ps(pVect1);
205 __m256 v2 = _mm256_loadu_ps(pVect2);
207 sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
209 v1 = _mm256_loadu_ps(pVect1);
211 v2 = _mm256_loadu_ps(pVect2);
213 sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
216 _mm256_store_ps(TmpRes, sum256);
217 float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
223 InnerProductDistanceSIMD16ExtAVX(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
224 return 1.0f - InnerProductSIMD16ExtAVX(pVect1v, pVect2v, qty_ptr);
232 InnerProductSIMD16ExtSSE(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
233 float PORTABLE_ALIGN32 TmpRes[8];
234 float *pVect1 = (
float *) pVect1v;
235 float *pVect2 = (
float *) pVect2v;
236 size_t qty = *((
size_t *) qty_ptr);
238 size_t qty16 = qty / 16;
240 const float *pEnd1 = pVect1 + 16 * qty16;
243 __m128 sum_prod = _mm_set1_ps(0);
245 while (pVect1 < pEnd1) {
246 v1 = _mm_loadu_ps(pVect1);
248 v2 = _mm_loadu_ps(pVect2);
250 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
252 v1 = _mm_loadu_ps(pVect1);
254 v2 = _mm_loadu_ps(pVect2);
256 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
258 v1 = _mm_loadu_ps(pVect1);
260 v2 = _mm_loadu_ps(pVect2);
262 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
264 v1 = _mm_loadu_ps(pVect1);
266 v2 = _mm_loadu_ps(pVect2);
268 sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
270 _mm_store_ps(TmpRes, sum_prod);
271 float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
277 InnerProductDistanceSIMD16ExtSSE(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
278 return 1.0f - InnerProductSIMD16ExtSSE(pVect1v, pVect2v, qty_ptr);
283#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
286 DISTFUNC<float> InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtSSE;
287 DISTFUNC<float> InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtSSE;
290 InnerProductDistanceSIMD16ExtResiduals(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
291 size_t qty = *((
size_t *) qty_ptr);
292 size_t qty16 = qty >> 4 << 4;
293 float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
294 float *pVect1 = (
float *) pVect1v + qty16;
295 float *pVect2 = (
float *) pVect2v + qty16;
297 size_t qty_left = qty - qty16;
298 float res_tail =
InnerProduct(pVect1, pVect2, &qty_left);
299 return 1.0f - (res + res_tail);
303 InnerProductDistanceSIMD4ExtResiduals(
const void *pVect1v,
const void *pVect2v,
const void *qty_ptr) {
304 size_t qty = *((
size_t *) qty_ptr);
305 size_t qty4 = qty >> 2 << 2;
307 float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
308 size_t qty_left = qty - qty4;
310 float *pVect1 = (
float *) pVect1v + qty4;
311 float *pVect2 = (
float *) pVect2v + qty4;
312 float res_tail =
InnerProduct(pVect1, pVect2, &qty_left);
314 return 1.0f - (res + res_tail);
326 #if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512)
327 #if defined(USE_AVX512)
328 if (AVX512Capable()) {
329 InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX512;
330 InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX512;
331 }
else if (AVXCapable()) {
332 InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
333 InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
335 #elif defined(USE_AVX)
337 InnerProductSIMD16Ext = InnerProductSIMD16ExtAVX;
338 InnerProductDistanceSIMD16Ext = InnerProductDistanceSIMD16ExtAVX;
343 InnerProductSIMD4Ext = InnerProductSIMD4ExtAVX;
344 InnerProductDistanceSIMD4Ext = InnerProductDistanceSIMD4ExtAVX;
350 else if (dim % 4 == 0)
Definition space_ip.h:318
DISTFUNC< float > fstdistfunc_
Definition space_ip.h:320
size_t dim_
Definition space_ip.h:322
size_t get_data_size()
Definition space_ip.h:361
void * get_dist_func_param()
Definition space_ip.h:369
InnerProductSpace(size_t dim)
Definition space_ip.h:324
~InnerProductSpace()
Definition space_ip.h:373
size_t data_size_
Definition space_ip.h:321
DISTFUNC< float > get_dist_func()
Definition space_ip.h:365
Definition bruteforce.h:7
MTYPE(*)(const void *, const void *, const void *) DISTFUNC
Definition hnswlib.h:138
static float InnerProduct(const void *pVect1, const void *pVect2, const void *qty_ptr)
Definition space_ip.h:7
static float InnerProductDistance(const void *pVect1, const void *pVect2, const void *qty_ptr)
Definition space_ip.h:18