c中的筛eratosthenes平行化

我刚刚从我的老师那里得到了C中Eratosthenes筛的并行化的解决方案。这些线是什么意思？

if (!(a[i / 64] & ((uint_fast64_t)1 << (i % 64))))       
    continue;

a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64));

完整代码：

#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#pragma GCC optimize("unroll-loops")
#define N 100
#define K 7

uint_fast64_t a[N];

void *threadBehaviour(void *) {
    static uint16_t k = 0;
    k = (k + 1) % K;
    uint_fast64_t nb_max_i;
    for (uint_fast64_t i = 2; i <= sqrt(N); i += 1) {
        if (!(a[i/64 ] & ((uint_fast64_t)1 << (i %64 ))))
            continue;
        nb_max_i = ((N) - (i * i)) / i;
        uint_fast64_t exitCondition = i * i + (nb_max_i * (k + 1) / K) * i;
        for (uint_fast64_t j = i * i + (nb_max_i * k / K) * i; j <= exitCondition; j += i) {
            a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64));
        }
    }
    pthread_exit(NULL);
}

int main() {
    pthread_t k[K];
    for (uint_fast64_t i = 0; i < N; i++) {
        a[i] = 0xAAAAAAAAAAAAAAAA;
    }
    for (uint16_t u = 0; u < K; u++) {
        pthread_create(&k[u], NULL, &threadBehaviour, NULL);
    }
    clock_t t;
    t = clock();
    for (uint16_t j = 0; j < K; j++) {
        pthread_join(k[j], NULL);
    }
    t = clock() - t;
    double time_taken = ((double)t) / CLOCKS_PER_SEC;
    printf("%f seconds", time_taken);

    for (uint_fast64_t i = 2; i < (N+1); i++) {
        if (a[i / 64] & ((uint_fast64_t)1 << i)) {
            printf("| %lu ", i);
        }
    }
    printf("\n");
    return 0;
}

特别是对于a[i/64]，我不明白这是怎么回事

测试(a[i / 64] & ((uint_fast64_t)1 << (i % 64)))检查是否在全局数组a的元素中设置了位。比特数是i的低6比特，元素数是i的高比特。外部循环的初始部分选择下一个素数，其倍数在数组的其余部分（或分配给该线程的切片）中被标记为复合。
类似地，a[j / 64] &= ~(((uint_fast64_t)1) << (j % 64));清除索引j的相应位，将j标记为合数。
此实现创建多个线程，每个线程更新数组的不同部分。然而，这种方法存在一些重大问题：
thread函数开头的这些行绝对可怕：

static uint16_t k = 0;
k = (k + 1) % K;

目的是确定当前线程修改了全局数组的哪一部分。然而，static变量k是由没有同步机制的并发线程修改的，并且这种修改不是原子的，因此不能保证每个线程都会获得不同的k值。相反，您应该使用适当的信息分配一个结构，并传递一个指针作为opaque参数，从而消除对全局或static变量的需求。
这种方法还有另一个主要问题：所有线程访问由第一线程并发修改的全局数组的开始。这充其量是丑陋的，而且可能是不正确的。
也不清楚为什么数组有N元素来计算N以下的素数，N / 64应该足够了。
j的初始值和最大值的表达式过于复杂，难以验证。我几乎可以肯定j的最后一个值在某些情况下可能会导致未定义的行为。
还要注意，clock()测量的是cpu时间，而不是运行时间。您应该使用timespec_get()、gettimeofday()或其他精确的时间函数。
以下是修改后的版本：

#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>

//#pragma GCC optimize("unroll-loops")
struct sieve_args {
    uint64_t *a;
    uint64_t maxp, begin, end;
};

void *threadBehaviour(void *opaque) {
    struct sieve_args *args = opaque;
    uint64_t *a = args->a;

    for (uint64_t p = 3; p <= args->maxp; p += 2) {
        if (!(a[p / 64] & ((uint64_t)1 << (p % 64))))
            continue;
        /* clear odd multiples of p greater or equal to p * p
           in the slice as composites */
        uint64_t start = (args->begin + p - 1) / p * p;
        if (start < p * p)
            start = p * p;
        if (start % 2 == 0)
            start += p;
        uint64_t stop = args->end;
        for (uint64_t j = start; j < stop; j += p + p) {
            a[j / 64] &= ~((uint64_t)1 << (j % 64));
        }
    }
    pthread_exit(NULL);
}

/* cannot use clock() to measure elapsed time in a multi-threaded program */
static uint64_t clock_us(void) {
#ifdef TIME_UTC
    struct timespec ts;
    timespec_get(&ts, TIME_UTC);
    return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
#else
    struct timeval tt;
    gettimeofday(&tt, NULL);
    return (uint64_t)tt.tv_sec * 1000000 + tt.tv_usec;
#endif
}

/* count prime numbers up to and including N */
int main(int argc, char *argv[]) {

    uint64_t N = argc > 1 ? strtoull(argv[1], NULL, 0) : 1000000;
    int K = argc > 2 ? strtoul(argv[2], NULL, 0) : 7;

    /* size of the array of uint64_t */
    size_t NW = N / 64 + 1;

    pthread_t tid[K];
    struct sieve_args args[K];

    /* allocate at least N+1 bits */
    uint64_t *a = calloc(sizeof(*a), NW);

    uint64_t t = clock_us();

    /* initialize a with all even numbers composite */
    for (size_t i = 0; i < NW; i++) {
        a[i] = 0xAAAAAAAAAAAAAAAA;
    }
    for (int k = 0; k < K; k++) {
        args[k].a = a;
        args[k].maxp = (uint64_t)+sqrt(N);
        args[k].begin = 64 * ((uint64_t)NW * k / K);
        args[k].end = 64 * ((uint64_t)NW * (k + 1) / K);
        pthread_create(&tid[k], NULL, threadBehaviour, &args[k]);
    }
    for (int k = 0; k < K; k++) {
        pthread_join(tid[k], NULL);
    }
    t = clock_us() - t;
    printf("%f ms\n", t / 1000.0);

    unsigned long long count = 0;

    /* count prime numbers */
    a[0] &= ~(1 << 1); /* set 1 as not prime */
    a[0] |= 1 << 2; /* set 2 as prime */
    for (uint64_t i = 2; i <= N; i++) {
        if (a[i / 64] & ((uint64_t)1 << (i % 64))) {
            //printf("| %llu ", i);
            count++;
        }
    }
    //printf("\n");
    printf("%llu: %llu primes\n", (unsigned long long)N, count);
    free(a);
    return 0;
}

c中的筛eratosthenes平行化

1条答案

相关问题

热门标签

最新问答