大家好,又见面了,我是你们的朋友全栈君。
SIMD(Single Instruction, Multiple Data)是指一条指令可以同时对多组数据进行计算并输出结果;与 SIMD 相对应的是 SISD(Single Instruction, Single Data),即一条指令只处理一组数据。
intel SSE2 , AVX2, AVX-512
假设有一个任务是统计字符串中某一个字符出现的次数,我们可以用128bit 的SIMD指令进行统计。每8个bit代表一个字符,所以一个128bit寄存器一次可以比较16个字符;比较(compare)之后,只需要再用两条指令(move mask、pop count)就能得到这16个字符中匹配的个数。
详细测试:
#include <stdio.h>
#include <thread>
#define INC_TO 1000000 // one million...
#include <mutex>
#include <functional>
#include <atomic>
#include <vector>
#include <sstream>
#include <iostream>
#include <emmintrin.h>
#include <immintrin.h>
#include <assert.h>
#include <x86intrin.h>
// A raw (pointer, length) pair describing a block of bytes.
// In this file it is filled in by GetFileContent(), which points `buffer`
// at a `new char[]` allocation. The struct has no destructor, so whoever
// holds it is responsible for releasing that allocation.
struct StringView {
// Start of the byte data; GetFileContent() does not append a terminating NUL.
char* buffer;
// Number of bytes reachable through `buffer`.
size_t len;
};
// Writes the benchmark input file.
//
// Despite the name, the content is deterministic: the 25-byte sequence
// 'A'..'Y' ('Z' is excluded by the `<` comparison) repeated 16*8*1000 times,
// i.e. 3,200,000 bytes in total. That size is a multiple of 32, so both the
// 16-byte and the 32-byte counting kernels can consume it in whole chunks.
//
// filename: path of the file to create or truncate.
// On failure to open the file, an error is reported on stderr and nothing
// is written.
void RandomGeneratorFile(const char* filename) {
    FILE* fp = fopen(filename, "w");
    if (fp == NULL) {
        // Previously a NULL stream was passed straight to fputc()/fclose().
        perror(filename);
        return;
    }

    const size_t numbers = 16 * 8 * 1000;
    for (size_t count = 0; count < numbers; count++) {
        for (char c = 'A'; c < 'Z'; c++) {
            fputc(c, fp);
        }
    }

    fclose(fp);
}
// Reads an entire file into a freshly allocated buffer.
//
// filename: path of the file to read.
// Returns a heap-allocated StringView whose `buffer` was allocated with
// `new char[]` and whose `len` is the number of bytes actually read; the
// caller owns both and must release them with `delete[]` / `delete`.
// Returns nullptr if the file cannot be opened or its size cannot be
// determined. The buffer is NOT NUL-terminated.
StringView* GetFileContent(const char* filename) {
    // Binary mode so that the ftell() offset is a plain byte count.
    FILE* fp = fopen(filename, "rb");
    if (fp == nullptr) {
        perror(filename);
        return nullptr;
    }

    // Seek to the end and read the offset there to learn the file size.
    if (fseek(fp, 0, SEEK_END) != 0) {
        perror("fseek");
        fclose(fp);
        return nullptr;
    }
    const long end_pos = ftell(fp);
    if (end_pos < 0 || fseek(fp, 0, SEEK_SET) != 0) {
        perror("ftell/fseek");
        fclose(fp);
        return nullptr;
    }

    const size_t size = (size_t)end_pos;
    // %zu is the correct conversion for size_t (%d was undefined behaviour).
    printf("file size:%zu\n", size);

    char* buffer = new char[size];
    // Element size 1, element count `size`: the return value is then the
    // exact number of bytes read. The original passed `size` for both.
    const size_t bytes_read = fread(buffer, 1, size, fp);
    fclose(fp);

    StringView* str = new StringView();
    str->buffer = buffer;
    // Only expose bytes that were really filled in.
    str->len = bytes_read;
    return str;
}
// Scalar baseline: counts occurrences of `ch` one byte at a time.
//
// data: bytes to scan (need not be NUL-terminated).
// size: number of bytes to scan.
// ch:   byte value to look for.
// Returns how many of the first `size` bytes of `data` equal `ch`.
size_t count_chars_8(const char* data, size_t size, const char ch)
{
    size_t matches = 0;
    for (const char* cursor = data; cursor != data + size; ++cursor) {
        if (*cursor == ch) {
            ++matches;
        }
    }
    return matches;
}
// SSE2 version: counts occurrences of `ch`, 16 bytes per iteration.
//
// data: bytes to scan; no alignment requirement.
// size: number of bytes to scan; any value (a scalar loop handles the
//       final size % 16 bytes).
// ch:   byte value to look for.
// Returns how many of the first `size` bytes of `data` equal `ch`.
size_t count_chars_128(const char* data, size_t size, const char ch)
{
    size_t total = 0;
    // Broadcast ch into all 16 byte lanes.
    const __m128i tocmp = _mm_set1_epi8(ch);

    while (size >= 16) {
        // Unaligned load: `data` is an arbitrary char pointer, so the
        // aligned _mm_load_si128 could fault on a misaligned address.
        const __m128i chunk = _mm_loadu_si128((__m128i const*)data);
        // Per-byte compare: each lane becomes 0xFF on match, 0x00 otherwise.
        const __m128i results = _mm_cmpeq_epi8(chunk, tocmp);
        // Collapse the top bit of every lane into the low 16 bits of an int.
        const int mask = _mm_movemask_epi8(results);
        // Each set bit is one matching byte. The popcount is a scalar
        // operation on a general-purpose register, not a vector one.
        total += (size_t)__builtin_popcount((unsigned)mask);
        data += 16;
        size -= 16;
    }

    // Remaining bytes that do not fill a whole 16-byte chunk.
    while (size) {
        if (*data == ch)
            total += 1;
        data += 1;
        size -= 1;
    }
    return total;
}
// AVX
size_t count_chars_avx(const char* data, size_t size, const char ch)
{
size_t total = 0;
assert(size % 16 == 0);
__m256i tocmp = _mm256_set1_epi8(ch);
while(size) {
__m256i chunk = _mm256_loadu_si256((__m256i*)data);
__m256i results = _mm256_cmpeq_epi8(tocmp, chunk);
unsigned mask = _mm256_movemask_epi8(results);
total += __builtin_popcount(mask);
data += 32;
size -= 32;
}
// printf("count:%d\n", total);
return total;
}
// Runs `fn(a, size, ch)` once and prints the elapsed wall-clock time in
// microseconds, measured with CLOCK_MONOTONIC_RAW.
//
// name: label printed alongside the timing.
// fn:   counting function under test.
// a:    input bytes passed to fn.
// size: number of input bytes passed to fn.
// ch:   byte value passed to fn.
static void print_time_us(const char* name,
                          size_t (*fn)(const char*, size_t, const char),
                          const char* a, size_t size, const char ch) {
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
    // Store the result in a volatile so the optimizer cannot discard the
    // call being measured as dead code.
    volatile size_t sink = fn(a, size, ch);
    clock_gettime(CLOCK_MONOTONIC_RAW, &end);
    (void)sink;

    // tv_nsec of `end` may be smaller than that of `start`; keeping the
    // arithmetic signed lets the seconds term compensate.
    const long long delta_us =
        (long long)(end.tv_sec - start.tv_sec) * 1000000LL +
        (long long)(end.tv_nsec - start.tv_nsec) / 1000LL;

    // %llu requires unsigned long long; uint64_t is not that type on every
    // platform, so the argument is cast explicitly.
    printf("Running: '%s' took %llu u/s\n", name, (unsigned long long)delta_us);
}
int main()
{
RandomGeneratorFile("test.file");
StringView* str = GetFileContent("test.file");
print_time_us("NORMAL", count_chars_8, str->buffer, str->len, 'A');
print_time_us("SIMD", count_chars_128, str->buffer, str->len, 'A');
print_time_us("AVX", count_chars_avx, str->buffer, str->len, 'A');
return 0;
}
编译命令:g++ -std=c++14 main.cc -o main -mavx -mavx2 -O2
输出:
file size:3200000
Running: 'NORMAL' took 2505 u/s
Running: 'SIMD' took 192 u/s
Running: 'AVX' took 97 u/s
发布者:全栈程序员-用户IM,转载请注明出处:https://javaforall.cn/139752.html原文链接:https://javaforall.cn
【正版授权,激活自己账号】: Jetbrains全家桶Ide使用,1年售后保障,每天仅需1毛
【官方授权 正版激活】: 官方授权 正版激活 支持Jetbrains家族下所有IDE 使用个人JB账号...