@@ -133,4 +133,145 @@ int sgn(int X){
133
133
if (X > 0 ) return 1 ;
134
134
if (X < 0 ) return -1 ;
135
135
return 0 ;
136
- }
136
+ }
137
+
138
// Function 12: split a packed integer color value into its three channels
// Reference: none
// Summary: R is taken from bits 0-7 of Color, G from bits 8-15 and B from
//          bits 16-23; bits 24-31 are ignored. Each output is in 0..255.
void GetRGB(int Color, int *R, int *G, int *B)
{
    // Mask first so the value being shifted is never negative.
    const int Red = Color & 0xFF;
    const int Green = (Color & 0xFF00) >> 8;
    const int Blue = (Color & 0xFF0000) >> 16;

    *R = Red;
    *G = Green;
    *B = Blue;
}
146
+
147
// Function 13: approximate square root of a number using Newton's method
// Reference: https://www.cnblogs.com/qlky/p/7735145.html
// Summary: builds an initial guess for 1/sqrt(X) directly from the bit
//          pattern of X, refines it with three Newton steps and returns the
//          reciprocal of the result. This is an approximation, not an exact
//          square root. The bit trick assumes float and int have the same
//          size (32-bit IEEE-754 float), so it does not work for double.
// Parameter: X - value whose square root is wanted.
// Returns:   an approximation of sqrt(X); 0 when X <= 0.
float Sqrt(float X)
{
    // Reinterpret the float's bits through a union: casting &X to int* and
    // dereferencing it violates the strict-aliasing rule (undefined behaviour).
    union {
        float Value;
        int Bits;
    } Pun;
    float HalfX;

    // Zero and negative inputs have no usable reciprocal square root; for
    // small negative values the bit arithmetic below would also overflow.
    if (X <= 0.0f) {
        return 0.0f;
    }

    HalfX = 0.5f * X;
    Pun.Value = X;                             // get the bits of the float value
    Pun.Bits = 0x5f375a86 - (Pun.Bits >> 1);   // initial guess y0 for 1/sqrt(X)
    X = Pun.Value;                             // convert the bits back to float

    // Newton steps for y = 1/sqrt(X); each repetition increases accuracy.
    X = X * (1.5f - HalfX * X * X);
    X = X * (1.5f - HalfX * X * X);
    X = X * (1.5f - HalfX * X * X);

    return 1 / X;
}
161
+
162
// Function 14: add two unsigned 16-bit histograms element-wise, Y = X + Y
// Reference: none
// Summary: SSE-optimized. Processes exactly 256 entries as 32 vectors of
//          eight 16-bit lanes. Sums wrap around modulo 65536 (no saturation).
//          Aligned loads and stores are used, so X and Y must both be
//          16-byte aligned.
// Original author's note: do not expect hand-written assembly to beat the
//          intrinsics here; that has already been tried.
// Parameters: X - histogram to add (only read).
//             Y - histogram that receives the sum (read and written).
void HistgramAddShort(unsigned short *X, unsigned short *Y)
{
    const __m128i *Src = (const __m128i *)X;
    __m128i *Dst = (__m128i *)Y;

    for (int Block = 0; Block < 32; Block++) {
        const __m128i Acc = _mm_load_si128(Dst + Block);
        const __m128i Add = _mm_load_si128(Src + Block);
        _mm_store_si128(Dst + Block, _mm_add_epi16(Acc, Add));
    }
}
200
+
201
// Function 15: subtract one unsigned 16-bit histogram from another, Y = Y - X
// Reference: none
// Summary: SSE-optimized. Processes exactly 256 entries as 32 vectors of
//          eight 16-bit lanes. Differences wrap around modulo 65536 (no
//          saturation). Aligned loads and stores are used, so X and Y must
//          both be 16-byte aligned.
// Parameters: X - histogram to subtract (only read).
//             Y - histogram that receives the difference (read and written).
void HistgramSubShort(unsigned short *X, unsigned short *Y)
{
    const __m128i *Src = (const __m128i *)X;
    __m128i *Dst = (__m128i *)Y;

    for (int Block = 0; Block < 32; Block++) {
        const __m128i Acc = _mm_load_si128(Dst + Block);
        const __m128i Sub = _mm_load_si128(Src + Block);
        _mm_store_si128(Dst + Block, _mm_sub_epi16(Acc, Sub));
    }
}
239
+
240
// Function 16: combined add and subtract of unsigned 16-bit histograms,
//              Z = Z + Y - X
// Reference: none
// Summary: SSE-optimized. Processes exactly 256 entries as 32 vectors of
//          eight 16-bit lanes. Y is added to Z first, then X is subtracted;
//          all arithmetic wraps around modulo 65536 (no saturation). Aligned
//          loads and stores are used, so X, Y and Z must all be 16-byte
//          aligned.
// Original author's note: do not expect hand-written assembly to beat the
//          intrinsics here; that has already been tried.
// Parameters: X - histogram to subtract (only read).
//             Y - histogram to add (only read).
//             Z - histogram that is updated (read and written).
void HistgramSubAddShort(unsigned short *X, unsigned short *Y, unsigned short *Z)
{
    const __m128i *Minus = (const __m128i *)X;
    const __m128i *Plus = (const __m128i *)Y;
    __m128i *Dst = (__m128i *)Z;

    for (int Block = 0; Block < 32; Block++) {
        const __m128i Sum = _mm_add_epi16(_mm_load_si128(Plus + Block),
                                          _mm_load_si128(Dst + Block));
        _mm_store_si128(Dst + Block,
                        _mm_sub_epi16(Sum, _mm_load_si128(Minus + Block)));
    }
}
0 commit comments