cs50 pset5拼写优化

b1payxdu  于 2023-05-06  发布在  其他
关注(0)|答案(3)|浏览(136)

我已经完成了拼写测试并通过了所有的检查。但我还是对你的表演感到不安。我尽了最大的努力进行研究和测试,但我的实现比员工的慢10-20%。如何改进我的代码?

// Implements a dictionary's functionality

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "dictionary.h"

// Represents a node in a hash table
typedef struct node
{
    char word[LENGTH + 1];
    struct node *next;
}
node;
//Prototypes
unsigned int hash(const char *word);

// TODO: Choose number of buckets in hash table
const unsigned int N = 187751; //odd prime number bigger than word count, because math

// Hash table
node *table[N]; //may need to use calloc

//word count
int word_count = 0;

// Returns true if word is in dictionary, else false
bool check(const char *word)
{
    // TODO
    //debug
    //int hashcode = hash(word);
    //printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);
    for (node *tmp = table[hash(word)]; tmp != NULL; tmp = tmp->next) //iterate through bucket
    {
        if (strcasecmp(tmp->word, word) == 0) //compare strings
        {
            return true;
        }
    }
    return false;
}

// Hashes word to a number
unsigned int hash(const char *word)
{
    // TODO: Improve this hash function
    //Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
    int hash = 0;
    /*int w = 0;
    //convert to upper case
    while (word[w] != '\0')
    {
        word[w] = toupper(word[w]);
        w++;
    }*/
    for (int i = 0; i <= strlen(word); i++)
    {
        hash = (31 * hash + toupper(word[i])) % N; //31 because math
        //check hash size
        /*if (hash >= N)
            {
                printf("incorrect hash!");
            }*/
    }
    return hash;
}

// Loads dictionary into memory, returning true if successful, else false
bool load(const char *dictionary)
{
    // TODO
    //open dictionary file
    FILE *dict = fopen(dictionary, "r");
    if (dict == NULL)
    {
        //printf("Could not open file.\n");
        return false;
    }
    //create temp string for word
    char *word = malloc(LENGTH + 1);
    if (word == NULL)
    {
        return false;
    }
    //read words from dictionary and write to hash table
    while (fscanf(dict, "%s", word) != EOF)
    {
        //node *tmp_word = NULL;
        node *n = malloc(sizeof(node));
        if (n == NULL)
        {
            return false;
        }
        int pos = hash(word); //run hash to find the bucket in table
        strcpy(n->word, word); //write word to temp node
        if (table[pos] == NULL) //check if node is first in bucket (may need to use calloc to create a table)
        {
            n->next = NULL;
        }
        else //if not the first, write node to the head
        {
            n->next = table[pos]; //reference first node
        }
        table[pos] = n; //write node to table
        word_count++; //count new word
    }
    fclose(dict);
    free(word);
    //debug
    /*int j = 0;
    while (j <= 11)
    {
        for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей
                {
                    printf("%s\n", tmp->word); //забираем значение ноды

                }
        printf("Bucket number: %i\n", j);
        j++;
    }*/
    //printf("word count:%i\n", word_count);
    return true;
}

// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int size(void)
{
    return word_count;
}

// Unloads dictionary from memory, returning true if successful, else false
bool unload(void)
{
    // TODO
    for (int i = 0; i <= N; i++)
    {
        while (table[i] != NULL)
        {
            node *tmp = table[i]->next; //делаем ноду и переходим на следующую ячейку
            free(table[i]); //освобождаем текущую
            table[i] = tmp; //начинаем list со второй ячейки
        }
    }
    return true;
}

我的成果:

WORDS MISSPELLED:     955
WORDS IN DICTIONARY:  143091
WORDS IN TEXT:        17756
TIME IN load:         0.04
TIME IN check:        0.02
TIME IN size:         0.00
TIME IN unload:       0.01
TIME IN TOTAL:        0.07

CS50工作人员的结果:

WORDS MISSPELLED:     955
WORDS IN DICTIONARY:  143091
WORDS IN TEXT:        17756
TIME IN load:         0.02
TIME IN check:        0.02
TIME IN size:         0.00
TIME IN unload:       0.01
TIME IN TOTAL:        0.06

我已经玩过水桶大小,无济于事((
Pset 5说明:https://cs50.harvard.edu/x/2022/psets/5/speller/
我还包括dictionary.h,如果你需要的话。Check正在从另一个文本文件中获取字符串,而不是从加载中使用的文本文件中获取字符串。

#ifndef DICTIONARY_H
#define DICTIONARY_H

#include <stdbool.h>

// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 45

// Prototypes
bool check(const char *word);
unsigned int hash(const char *word);
bool load(const char *dictionary);
unsigned int size(void);
bool unload(void);

#endif // DICTIONARY_H

我的问题在Craig Estey中得到了完美的回答。他的代码运行速度比我的快,比cs50员工的更快:

较大的文本:

我学到了很多,我真的很棒。这是我在这里的第一个问题,在SO,我从来没有预料到它是如此乐于助人和友好的地方。

apeeds0o

apeeds0o1#

几个问题:

  1. while (fscanf(dict, "%s", word) != EOF)错误。您想要:while (fscanf(dict, "%s", word) == 1)
    1.更快地将给定的单词 * 存储 * 为 * 大写 *。(即)我们对每个单词只执行toupper * 一次 *。
  2. check函数将更快,因为它可以使用strcmp而不是strcasecmp [这是 * 慢 *]
    1.如果我们可以将字段添加到node结构体中,我们就可以让它保存一个哈希值。这可以加快check函数的速度(请参阅下面代码中的NODEHASH)。
    1.为 each 节点执行malloc是缓慢/浪费的。pooled/竞技场/slab分配器可以更快(请参阅下面代码中的FASTALLOC)。同样,if 我们可以向node结构体添加一个字段。
    下面的代码中还记录了一些其他错误(请参阅“注意”注解)。我使用了cpp条件来表示旧的与新代码:
#if 0
// old code
#else
// new code
#endif

#if 1
// new code
#endif

下面是重构后的代码。我已经编译了它,但没有测试它:

// Implements a dictionary's functionality

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

//#include "dictionary.h"

// NOTE/BUG: dictionary.h was _not_ posted, so we have to guess
#define LENGTH      20

#ifndef NODEHASH
#define NODEHASH    1
#endif

#ifndef FASTALLOC
#define FASTALLOC   1
#endif

// Represents a node in a hash table
typedef struct node {
    char word[LENGTH + 1];
    struct node *next;
#if NODEHASH
    unsigned int hash;
#endif
#if FASTALLOC
    int alloc;
#endif
} node;

//Prototypes
unsigned int hash(const char *word);

// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
// NOTE/BUG: this doesn't work too well for C (may be okay in C++) when used
// as dimension for table below
// NOTE/BUG: enum may be faster in calculations
#if 0
const unsigned int N = 187751;
#else
enum {
    N = 187751
};
#endif

// Hash table
node *table[N];                         // may need to use calloc

//word count
int word_count = 0;

// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
    // TODO
    // debug
    // int hashcode = hash(word);
    // printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);

#if 1
    unsigned int hval = hash(word);
#endif

    // iterate through bucket
    for (node *tmp = table[hval]; tmp != NULL; tmp = tmp->next) {
#if NODEHASH
        // check hash value first -- limits number of str*cmp calls
        if (tmp->hash != hval)
            continue;
#endif

        // compare strings
// NOTE/BUG: strcasecmp is _slow_ and if we _store_ uppercase we don't need it
#if 0
        if (strcasecmp(tmp->word, word) == 0) {
            return true;
        }
#else
        if (strcmp(tmp->word, word) == 0)
            return true;
#endif
    }

    return false;
}

// Hashes word to a number
unsigned int
hash(const char *word)
{
    // TODO: Improve this hash function
    // Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
    int hash = 0;

    /* int w = 0; //convert to upper case while (word[w] != '\0') { word[w] = toupper(word[w]); w++; } */

// NOTE/BUG: _without_ optimization this is O(n^2) instead of O(n) because it
// calls strlen on _each_ loop iteration
#if 0
    for (int i = 0; i <= strlen(word); i++) {
        hash = (31 * hash + toupper(word[i])) % N;  // 31 because math
        // check hash size
        /* if (hash >= N) { printf("incorrect hash!"); } */
    }
#else
    for (int chr = *word++;  chr != 0;  chr = *word++)
        hash = (31 * hash + chr) % N;   // 31 because math
#endif

    return hash;
}

#if FASTALLOC
node *freelist = NULL;
int freecount = 0;

// adjust this value to suit
#define ARENASIZE       1000

// newnode -- allocate from pool
node *
newnode(void)
{
    node *cur;

    while (1) {
        // get node from free list cache
        cur = freelist;

        // use the cached node if possible
        if (cur != NULL) {
            freelist = cur->next;
            break;
        }

        // allocate a new arena
        freelist = malloc(sizeof(node) * ARENASIZE);

        // out of memory
        if (freelist == NULL)
            break;

        // link all nodes in the arena
        node *prev = NULL;
        for (cur = freelist;  cur < &freelist[ARENASIZE];  ++cur) {
            cur->alloc = 0;
            cur->next = prev;
            prev = cur;
        }

        // mark the arena head
        freelist->alloc = 1;
    }

    return cur;
}
#endif

// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{
    // TODO
    // open dictionary file
    FILE *dict = fopen(dictionary, "r");

    if (dict == NULL) {
        // printf("Could not open file.\n");
        return false;
    }
    // create temp string for word
// NOTE/BUG: no need to malloc this (and it is slightly slower)
#if 0
    char *word = malloc(LENGTH + 1);
    if (word == NULL) {
        return false;
    }
#else
    char word[LENGTH + 1];
#endif

    // read words from dictionary and write to hash table
// NOTE/BUG: this scanf is wrong (and may loop too much)
#if 0
    while (fscanf(dict, "%s", word) != EOF) {
#else
    while (fscanf(dict, "%s", word) == 1) {
#endif
        // node *tmp_word = NULL;
#if FASTALLOC
        node *n = newnode();
#else
        node *n = malloc(sizeof(node));
#endif
        if (n == NULL)
            return false;

// NOTE/FIX: convert to uppercase _once_
        int idx;
        for (idx = 0;  word[idx] != 0;  ++idx)
            n->word[idx] = toupper(word[idx] & 0xFF);
        n->word[idx] = 0;

        // run hash to find the bucket in table
// NOTE/BUG: pos should be unsigned
#if 0
        int pos = hash(word);
#else
        unsigned int pos = hash(n->word);
#endif

#if 0
        strcpy(n->word, word);          // write word to temp node
#endif

#if NODEHASH
        n->hash = pos;
#endif

// NOTE/BUG: no need to check table[pos]
#if 0
        // check if node is first in bucket
        // (may need to use calloc to create a table)
        if (table[pos] == NULL)
        {
            n->next = NULL;
        }

        // if not the first, write node to the head
        else
        {
            n->next = table[pos];       // reference first node
        }
#else
        n->next = table[pos];
#endif

        table[pos] = n;                 // write node to table
        word_count++;                   // count new word
    }

    fclose(dict);
#if 0
    free(word);
#endif

    // debug
    /* int j = 0; while (j <= 11) { for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей { printf("%s\n", tmp->word); //забираем значение ноды

       } printf("Bucket number: %i\n", j); j++; } */
    // printf("word count:%i\n", word_count);
    return true;
}

// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
    return word_count;
}

// Unloads dictionary from memory, returning true if successful, else false
#if ! FASTALLOC
bool
unload(void)
{

    // TODO
// NOTE/BUG: this goes one beyond the table end (i.e. want: i < N)
#if 0
    for (int i = 0; i <= N; i++) {
        while (table[i] != NULL) {
            node *tmp = table[i]->next; // делаем ноду и переходим на следующую ячейку

            free(table[i]);             // освобождаем текущую
            table[i] = tmp;             // начинаем list со второй ячейки
        }
    }
#else
    for (int i = 0;  i < N;  ++i) {
        node *cur = table[i];
        if (cur == NULL)
            continue;

        node *next;

        for (;  cur != NULL;  cur = next) {
            next = cur->next;
            free(cur);
        }
#endif

    return true;
}
#endif

// Unloads dictionary from memory, returning true if successful, else false
#if FASTALLOC
bool
unload(void)
{
    node *cur;
    node *next;

    // remove all nodes from the table
    for (int i = 0;  i < N;  ++i) {
        cur = table[i];
        if (cur == NULL)
            continue;

        // put all nodes in hash table back on [cached] free list
        for (;  cur != NULL;  cur = next) {
            next = cur->next;
            cur->next = freelist;
            freelist = cur;
        }

        table[i] = NULL;
    }

    cur = freelist;
    freelist = NULL;

    // trim free list to allocation/arena heads
    for (;  cur != NULL;  cur = next) {
        next = cur->next;
        if (cur->alloc) {
            cur->next = freelist;
            freelist = cur;
        }
    }

    // free the allocated nodes
    cur = freelist;
    freelist = NULL;
    for (;  cur != NULL;  cur = next) {
        next = cur->next;
        free(cur);
    }

    return true;
}
#endif

下面是上面的一个清理版本(我通过unifdef运行它):

// Implements a dictionary's functionality

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

//#include "dictionary.h"

// NOTE/BUG: dictionary.h was _not_ posted, so we have to guess
#define LENGTH      20

// Represents a node in a hash table
typedef struct node {
    char word[LENGTH + 1];
    struct node *next;
    unsigned int hash;
    int alloc;
} node;

//Prototypes
unsigned int hash(const char *word);

// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
enum {
    N = 187751
};

// Hash table
node *table[N];                         // may need to use calloc

//word count
int word_count = 0;

// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
    // TODO
    // debug
    // int hashcode = hash(word);
    // printf("bucket: %i, word: %s\n", hash(word), table[hash(word)]->word);

    unsigned int hval = hash(word);

    // iterate through bucket
    for (node *tmp = table[hval]; tmp != NULL; tmp = tmp->next) {
        // check hash value first -- limits number of str*cmp calls
        if (tmp->hash != hval)
            continue;

        // compare strings
        if (strcmp(tmp->word, word) == 0)
            return true;
    }

    return false;
}

// Hashes word to a number
unsigned int
hash(const char *word)
{
    // TODO: Improve this hash function
    // Used Data Structures youtube course of Dr. Rob Edwards from San Diego State University mostly
    int hash = 0;

    /* int w = 0; //convert to upper case while (word[w] != '\0') { word[w] = toupper(word[w]); w++; } */

    for (int chr = *word++;  chr != 0;  chr = *word++)
        hash = (31 * hash + chr) % N;   // 31 because math

    return hash;
}

node *freelist = NULL;
int freecount = 0;

// adjust this value to suit
#define ARENASIZE       1000

// newnode -- allocate from pool
node *
newnode(void)
{
    node *cur;

    while (1) {
        // get node from free list cache
        cur = freelist;

        // use the cached node if possible
        if (cur != NULL) {
            freelist = cur->next;
            break;
        }

        // allocate a new arena
        freelist = malloc(sizeof(node) * ARENASIZE);

        // out of memory
        if (freelist == NULL)
            break;

        // link all nodes in the arena
        node *prev = NULL;
        for (cur = freelist;  cur < &freelist[ARENASIZE];  ++cur) {
            cur->alloc = 0;
            cur->next = prev;
            prev = cur;
        }

        // mark the arena head
        freelist->alloc = 1;
    }

    return cur;
}

// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{
    // TODO
    // open dictionary file
    FILE *dict = fopen(dictionary, "r");

    if (dict == NULL) {
        // printf("Could not open file.\n");
        return false;
    }
    // create temp string for word
// NOTE/BUG: no need to malloc this (and it is slightly slower)
    char word[LENGTH + 1];

    // read words from dictionary and write to hash table
// NOTE/BUG: this scanf is wrong (and may loop too much)
    while (fscanf(dict, "%s", word) == 1) {
        // node *tmp_word = NULL;
        node *n = newnode();
        if (n == NULL)
            return false;

// NOTE/FIX: convert to uppercase _once_
        int idx;
        for (idx = 0;  word[idx] != 0;  ++idx)
            n->word[idx] = toupper(word[idx] & 0xFF);
        n->word[idx] = 0;

        // run hash to find the bucket in table
// NOTE/BUG: pos should be unsigned
        unsigned int pos = hash(n->word);

        n->hash = pos;

// NOTE/BUG: no need to check table[pos]
        n->next = table[pos];

        table[pos] = n;                 // write node to table
        word_count++;                   // count new word
    }

    fclose(dict);

    // debug
    /* int j = 0; while (j <= 11) { for (node *tmp = table[j]; tmp != NULL; tmp = tmp->next) //делаем ноду; пока не дойдем до последней; переходим к следующей { printf("%s\n", tmp->word); //забираем значение ноды

       } printf("Bucket number: %i\n", j); j++; } */
    // printf("word count:%i\n", word_count);
    return true;
}

// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
    return word_count;
}

// Unloads dictionary from memory, returning true if successful, else false
bool
unload(void)
{
    node *cur;
    node *next;

    // remove all nodes from the table
    for (int i = 0;  i < N;  ++i) {
        cur = table[i];
        if (cur == NULL)
            continue;

        // put all nodes in hash table back on [cached] free list
        for (;  cur != NULL;  cur = next) {
            next = cur->next;
            cur->next = freelist;
            freelist = cur;
        }

        table[i] = NULL;
    }

    cur = freelist;
    freelist = NULL;

    // trim free list to allocation/arena heads
    for (;  cur != NULL;  cur = next) {
        next = cur->next;
        if (cur->alloc) {
            cur->next = freelist;
            freelist = cur;
        }
    }

    // free the allocated nodes
    cur = freelist;
    freelist = NULL;
    for (;  cur != NULL;  cur = next) {
        next = cur->next;
        free(cur);
    }

    return true;
}

更新#2:

我已经提交了我的版本。只是想更好地理解这个主题。- 梅奥特
好的,正因为如此,我已经制作了一个最终版本,这是我为速度所做的最大努力。如果你还在做的话,我会做更多的重构。
注意事项...
为每个节点执行 singlemalloc是浪费的。malloc相对较慢。请注意,在您的基准测试时间中,主要是load阶段比staff实现慢。
此外,malloc在内存中放置了一个 hidden 结构体,该结构体紧接在它返回的指针之前,用于跟踪分配。此结构占用空间,对于像节点结构这样的小分配,额外的内存开销可能很大。
最好调用malloc并在一个连续的块或数组中分配大量的node结构。进一步的分配可以来自该块内。这类似于“slab allocation”、“object pools”或“memory pools”:

  1. https://en.wikipedia.org/wiki/Slab_allocation
  2. https://en.wikipedia.org/wiki/Object_pool_pattern
  3. https://en.wikipedia.org/wiki/Memory_pool
    在下面的代码中,节点以1000为单位进行分配。这将malloc调用的数量从100,000减少到大约100。这个版本比我的上一个分配器提高了速度,因为它使用了一个额外的“段”(又名“竞技场”)结构来控制分配。它不再需要将每个节点结构体预链接到一个“自由列表”中,或者跟踪一个节点是否是从malloc接收的给定内存中的第一个节点。
    阅读字典时无需执行toupper/tolower。因为字典保证全部小写,每行只有一个单词,所以使用fgets读取字典会更快。我们可以将字典单词 * 直接 * 读入给定node结构体的word字段[* 如果 * 我们在执行fgets之前执行节点分配]。
    在计算哈希值时,我们不需要在每次迭代中都执行% N。由于模算术的性质,我们可以在最后做一次余数运算。
    我们还可以将hash值存储到node结构中。然后,在check中,我们可以比较哈希值并跳过一些[昂贵的] strcmp调用。在您的代码中,由于N非常大,大多数哈希桶只有一个或两个条目,因此这种加速效果可能很小[并且可能会稍微慢一些]。
    check中,最好将参数word复制到一个临时缓冲区中,使用小写 once。然后,我们可以使用更快的strcmp而不是更慢的strcasecmp。无论我们是否将哈希值存储在节点结构中,这都有效。同样,由于N值很大,这种加速效果可能有限。
    下面是重构后的代码:
// fix2.c -- Implements a dictionary's functionality

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include "dictionary.h"

#if _USE_ZPRT_
#define dbgprt(_fmt...)             printf(_fmt)
#else
#define dbgprt(_fmt...)             do { } while (0)
#endif

#ifndef NODEHASH
#define NODEHASH    1
#endif

#ifndef FASTNODE
#define FASTNODE    1
#endif

// Represents a node in a hash table
typedef struct node {
    char word[LENGTH + 1 + 1];
    struct node *next;
#if NODEHASH
    unsigned int hash;
#endif
} node;

//Prototypes
unsigned int hash(const char *word);

// TODO: Choose number of buckets in hash table
// odd prime number bigger than word count, because math
#ifndef N
#define N 187751
#endif

// Hash table
node *table[N];                         // may need to use calloc

// word count
int word_count = 0;

// 31 because math
// modulus only really needed to clip hash to table index
// so do it _once_ by caller of hash/hash2
#if 0
#define HASH \
    hash = (31 * hash + chr) % N
#else
#define HASH \
    hash = (31 * hash) + chr
#endif

// Hashes word to a number and copies lowercase chars to output buffer
unsigned int
hash2(char *dest,const char *word)
{
    unsigned int hash = 0;

    for (int chr = *word++;  chr != 0;  chr = *word++) {
        chr = tolower((unsigned char) chr);
        HASH;
        *dest++ = chr;
    }

    *dest = 0;

    return hash;
}

// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
    char tmp[LENGTH + 10];

    unsigned int hval = hash2(tmp,word);
    unsigned int hidx = hval % N;

    // iterate through bucket
    for (node *cur = table[hidx];  cur != NULL;  cur = cur->next) {
        // check hash value first -- limits number of str*cmp calls
#if NODEHASH
        if (cur->hash != hval)
            continue;
#endif

        if (strcmp(cur->word, tmp) == 0)
            return true;
    }

    return false;
}

// Hashes word to a number
unsigned int
hash(const char *word)
{
    unsigned int hash = 0;

    for (int chr = *word++;  chr != 0;  chr = *word++)
        HASH;

    return hash;
}

// adjust this value to suit
#ifndef ARENASIZE
#define ARENASIZE       1000
#endif

typedef struct seg seg_t;
struct seg {
    seg_t *seg_next;                // next segment
    int seg_count;                  // number of used nodes in this segment
    node seg_node[ARENASIZE];       // nodes in this segment
};

node *nodelist;                     // list of freed nodes
seg_t *seglist;                     // head of linked list of segments

// newnode -- allocate from pool
node *
newnode(void)
{
    seg_t *seg;
    node *cur;

    while (1) {
        // get node from free node cache
        // use the cached node if possible
        // this only happens if freenode were called [which it isn't]
        cur = nodelist;
        if (cur != NULL) {
            nodelist = cur->next;
            break;
        }

        // try to allocate from current segment
        seg = seglist;
        if (seg != NULL) {
            if (seg->seg_count < ARENASIZE) {
                cur = &seg->seg_node[seg->seg_count++];
                break;
            }
        }

        // allocate a new segment
        seg = malloc(sizeof(*seg));

        // out of memory
        if (seg == NULL)
            break;

        // mark segment as completely unused
        seg->seg_count = 0;

        // attach to segment list
        seg->seg_next = seglist;
        seglist = seg;
    }

    return cur;
}

// freenode -- free a node
void
freenode(node *cur)
{

#if FASTNODE
    cur->next = nodelist;
    nodelist = cur;
#else
    free(cur);
#endif
}

// segunload -- release all segments
void
segunload(void)
{
    seg_t *seg;
    seg_t *next;

    seg = seglist;
    seglist = NULL;

    for (;  seg != NULL;  seg = next) {
        next = seg->seg_next;
        free(seg);
    }
}

// Unloads dictionary from memory, returning true if successful, else false
bool
unload(void)
{
    node *cur;

    // remove all nodes from the table
    for (int i = 0;  i < N;  ++i) {
        cur = table[i];

        // put all nodes in hash table back on [cached] free list
        // NOTE: not really necessary
#if FASTNODE
        word_count = 0;
        cur = NULL;
#else
        node *next;
        for (;  cur != NULL;  cur = next, --word_count) {
            next = cur->next;
            freenode(cur);
        }
#endif

        table[i] = cur;
    }

    segunload();

    return true;
}

// Loads dictionary into memory, returning true if successful, else false
bool
load(const char *dictionary)
{

    // open dictionary file
    FILE *dict = fopen(dictionary, "r");
    if (dict == NULL)
        return false;

    // read words from dictionary and write to hash table
    while (1) {
#if FASTNODE
        node *n = newnode();
#else
        node *n = malloc(sizeof(*n));
#endif
        if (n == NULL)
            return false;

        char *word = n->word;

        if (fgets(word,sizeof(n->word),dict) == NULL) {
            freenode(n);
            break;
        }

        // strip newline
#if 0
        word += strlen(word);
        if (--word >= n->word) {
            if (*word == '\n');
                *word = 0;
        }
#else
        word = strchr(word,'\n');
        if (word != NULL)
            *word = 0;
#endif

        // run hash to find the bucket in table
        unsigned int pos = hash(n->word);

#if NODEHASH
        n->hash = pos;
#endif

        // put node in hash table
        pos %= N;
        n->next = table[pos];
        table[pos] = n;

        word_count++;
    }

    fclose(dict);

    // DEBUG: show the bucket counts
#if _USE_ZPRT_
    node *cur;
    int mincount = 0;
    int maxcount = 0;
    int buckcount = 0;
    int totcount = 0;

    for (int i = 0;  i < N;  ++i) {
        cur = table[i];
        if (cur == NULL)
            continue;

        int count = 0;
        for (;  cur != NULL;  cur = cur->next)
            ++count;

        printf("load: BUCKET/%d count=%d\n",i,count);

        if (i == 0) {
            mincount = count;
            maxcount = count;
        }

        if (count < mincount)
            mincount = count;
        if (count > maxcount)
            maxcount = count;
        totcount += count;

        ++buckcount;
    }

    printf("load: TOTAL buckcnt=%d/%d mincount=%d maxcount=%d avgcount=%d/%d\n",
        buckcount,N,mincount,maxcount,totcount / buckcount,totcount / N);
#endif

    return true;
}

// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int
size(void)
{
    return word_count;
}
eni9jsuy

eni9jsuy2#

不确定这是否仍然是一个问题,但大部分性能都来自哈希函数。
而不是调用像toUpper toLower这样的持久字符串函数,你应该依赖于一个快速简单的哈希函数:

// Hashes word to a number
unsigned int hash(const char *word)
{
    unsigned int hash = 0;
    for (int i = 0, n = strlen(word); i < n; i++)
    {
        hash = (hash << 2) ^ word[i];
    }
    return hash % N;
}

N为65535时工作正常

r8uurelv

r8uurelv3#

这里有另一个更简单的方法来大大减少总时间,你只需要做一个更快的哈希函数。

unsigned int hash(const char *word)
{
int hash = 0;
for(int i = 0; i<strlen(word); i++)
{
    if(isalpha(word[i]))
    {
        hash = round(toupper(word[i]) * pow(PI, i));
    }
}
hash = hash % N;
return hash;
}

我在这里使用的N值是786433,这是一个大素数。输出如下所示。

相关问题