Frequency of occurrence of hash table words

Scan a C source program, use two methods to count the frequency of occurrence of keywords in the source program, and compare the number of comparisons for each search.

(1) Use the Hash table to store the keywords that appear in the source program, and use Hash search technology to count the frequency of the keywords in the program. Use linear detection method to solve Hash conflict. Let the Hash function be:

Hash (key) = [(first letter sequence number of key) * 100 + (last letter sequence number of key)] MOD (maximum length)

(2) Use the sequence table to store the keywords that appear in the source program, and use the binary search technique to count the frequency of the keywords in the program.

#include <stdio.h>   
#include <string.h>   
#include <stdlib.h>   
#define MAX_TXT 1024   
#define MAX_WORD 47   
char Temp_word[MAX_WORD];   
//关键字哈希表数据结构类型    
struct data1   
{   
    char words[MAX_WORD];     //关键字   
    int exist;           //位置是否为空标记   
    int hashnum;        //假设不冲突计算出的key值,调试用   
}Key_words[MAX_TXT];   //关键字哈希表    
  
//扫描文件哈希表数据结构类型    
struct data2   
{   
    char words[MAX_WORD];   
    int exist;   //位置是否为空标记      
    int freq;    //频度   
    int clash_count;  //冲突次数   
}File_words[MAX_TXT];   
  
//二分查找关键字数据结构类型    
typedef struct  
{   
    char kw[MAX_WORD];//记录关键字   
    int  freq;//记录关键字出现的频率   
    int  BinCount; //记录查找的次数    
}KeyBin;   
KeyBin KeysBinList[MAX_TXT];//存放关键字的顺序表,用于二分查找   
  
  
int compare(const void* arg1, const void* arg2) {   
    KeyBin k1 = *(KeyBin*)arg1;KeyBin k2 = *(KeyBin*)arg2;   
    return strcmp(k1.kw, k2.kw);   
}   
  
void BSearch(char t[])//进行二分查找   
{   
    int low = 0;   
    int high = MAX_TXT;   
    int mid;   
    int count = 0;   
    while (low <= high)   
    {   
        mid = (low + high) / 2;   
        if (strcmp(t, KeysBinList[mid].kw) < 0) high = mid - 1;   
        else if (strcmp(t, KeysBinList[mid].kw) > 0) low = mid + 1;   
        else  
        {   
            KeysBinList[mid].freq++;   
            count++; break;   
        }   
        count++;   
    }   
    KeysBinList[mid].BinCount = count;   
}   
//-----------------------------------------------------------   
//扫描源程序,统计其中关键字出现的频率及查找次数    
void File_funcbinary()   
{   
    char choice = 0;   
    FILE* Fp_file;   
    char file_code[_MAX_FNAME] = "keyword.txt";   
    if ((Fp_file = fopen(file_code, "rt")) == NULL)   
    {   
        printf("找不到指定文件,程序结束运行!");   
        exit(0);   
    }   
    char A_word_file;   
    A_word_file = fgetc(Fp_file);   
    while (A_word_file != EOF)   
    {   
        int fst;                    //fst,lst分别保存当前单词第一个字母以及最后字母的ASCII码供后面的Hash函数用   
        int lst;   
        int count = 0;              //在按顺序读取单词的每个字符时,该变量表示了该字符在该单词中的位置   
        while (A_word_file < 65 || (A_word_file > 90 && A_word_file < 97) || A_word_file>122)   //判断是否为英文字母,不是的话,一直向后读取,直到英文字母为止   
        {   
            A_word_file = fgetc(Fp_file);   
            if (A_word_file == EOF)                                 //防止文本结尾为空格,读到最后结束读取   
            {   
                break;   
            }   
        }   
        if (A_word_file == EOF)   
            break;   
        fst = A_word_file;   
        while ((A_word_file >= 65 && A_word_file <= 90) || (A_word_file >= 97 && A_word_file <= 122))    //记录下关键字内容   
        {   
            Temp_word[count] = A_word_file;   
            count++;   
            lst = A_word_file;   
            A_word_file = fgetc(Fp_file);   
        }   
        Temp_word[count] = '\0';   
        BSearch(Temp_word);   
    }   
    printf("\n"); printf("\n");   
    printf(" \t\t\t关键字表及其出现频度:\n");   
    printf("\n");   
    printf("  关键字\t    出现频度\t   查找次数\n");   
    for (int i = 0; i < MAX_TXT; i++)   
    {   
        if (KeysBinList[i].freq != 0)   
        {   
            printf("  %-10s\t\t%d\t\t\t%d\n", KeysBinList[i].kw, KeysBinList[i].freq, KeysBinList[i].BinCount);   
        }   
    }   
  
    for (int k = 0; k < MAX_TXT; k++)                   //打印后数据清空!!!!!!!!!!!!   
    {   
  
        KeysBinList[k].BinCount = 0;   
        KeysBinList[k].freq = 0;   
    }   
    fclose(Fp_file);   
  
}   
  
//-----------------------------------------------------------   
//创建关键字的顺序表    
void creatbinary(void)   
{   
    FILE* fp;   
    int i = 0;   
  
    fp = fopen("keyword.txt", "r");   
    while (!feof(fp))   
        fscanf(fp, "%s ", KeysBinList[i++].kw);   
    fclose(fp);   
    qsort(KeysBinList, MAX_TXT, sizeof(KeyBin), compare);   
    File_funcbinary();   
}   
//-----------------------------------------------------------   
void ListHash()                                                     //按照Hash函数计算出所有关键字在Hash表中的位置   
{   
    FILE* Fp_key;   
    for (int i = 0; i < MAX_TXT; i++)                                       //初始化Hash表   
    {   
        Key_words[i].exist = 0;   
        Key_words[i].hashnum = 0;   
    }   
    printf("\n");   
    printf("\n");   
    printf("\n");   
    if ((Fp_key = fopen("keyword.txt", "rt")) == NULL)   
    {   
        printf("    对不起,找不到该文件,程序结束运行!\n");   
        exit(0);   
    }   
    char A_key_words = fgetc(Fp_key);   
    while (A_key_words != EOF)                                  //将所有关键字保存到Hash表中   
    {   
        int locate;                                             //locate保存该单词计算出的在Hash表中的位置   
        int locate1;   
        int first;                                              //first,last分别保存当前单词第一个字母以及最后字母的ASCII码供后面的Hash函数用   
        int last;   
        int count = 0;                                          //在按顺序读取单词的每个字符时,该变量表示了该字符在该单词中的位置   
        while (A_key_words < 65 || (A_key_words > 90 && A_key_words < 97) || A_key_words>122)   //判断是否为英文字母,不是的话,一直向后读取,直到英文字母为止   
        {   
            A_key_words = fgetc(Fp_key);   
            if (A_key_words == EOF)   
            {   
                break;   
            }   
        }   
        if (A_key_words == EOF)   
            break;   
        first = A_key_words;                                                //保存下关键字第一个字母的ASCII码   
        while ((A_key_words >= 65 && A_key_words <= 90) || (A_key_words >= 97 && A_key_words <= 122))    //记录下关键字内容   
        {   
            Temp_word[count] = A_key_words;   
            count++;   
            last = A_key_words;   
            A_key_words = fgetc(Fp_key);   
        }   
        Temp_word[count] = '\0';                                            //字符串结尾符号       
  
        locate = locate1 = (first * 100 + last) % MAX_TXT;      //Hash算法  locate1为没有冲突是改关键字应该在的位置   
        if (Key_words[locate].exist == 0)                   //该位置如果还没有存入关键字就把当前的关键字存在这个位置   
        {   
            strcpy(Key_words[locate].words, Temp_word);   
            Key_words[locate].exist = 1;   
            Key_words[locate].hashnum = locate1;   
        }   
        else                                                //当前位置如果有关键字就解决Hash冲突,一直向后移直到没有存关键字的位置存入   
        {   
            while (Key_words[locate].exist == 1)   
            {   
                locate++;   
                locate = locate % MAX_TXT;   
            }   
            strcpy(Key_words[locate].words, Temp_word);   
            Key_words[locate].exist = 1;   
            Key_words[locate].hashnum = locate1;   
        }   
    }   
    printf("\n");   
    for (int j = 0; j < MAX_TXT; j++)   
    {   
        if (Key_words[j].exist == 1)   
        {   
            printf("  %-10s\t\t\t%d\t\t\t%d\n", Key_words[j].words, j, Key_words[j].hashnum);   
        }   
    }   
    fclose(Fp_key);   
}   
  
//-----------------------------------------------------------   
// 扫描源程序,按照哈希查找统计其中关键字出现的频率、哈希表中的位置及其冲突次数    
void File_funcHash()   
{   
    char choice = 0;   
    for (int i = 0; i < MAX_TXT; i++)                                       //初始化Hash表   
    {   
        File_words[i].exist = 0;   
        File_words[i].clash_count = 0;   
    }   
    FILE* Fp_file;   
    char file_code[_MAX_FNAME] = "keyword.txt";   
    printf("\n");   
    printf("\n");   
    if ((Fp_file = fopen(file_code, "r")) == NULL)   
    {   
        printf("找不到指定文件,程序结束运行!");   
        exit(0);   
    }   
    char A_word_file;   
    A_word_file = fgetc(Fp_file);   
    while (A_word_file != EOF)   
    {   
        int fst;                    //fst,lst分别保存当前单词第一个字母以及最后字母的ASCII码供后面的Hash函数用   
        int lst;   
        int count = 0;              //在按顺序读取单词的每个字符时,该变量表示了该字符在该单词中的位置   
        while (A_word_file < 65 || (A_word_file > 90 && A_word_file < 97) || A_word_file>122)   //判断是否为英文字母,不是的话,一直向后读取,直到英文字母为止   
        {   
            A_word_file = fgetc(Fp_file);   
            if (A_word_file == EOF)                                 //防止文本结尾为空格,读到最后结束读取   
            {   
                break;   
            }   
        }   
        if (A_word_file == EOF)   
            break;   
        fst = A_word_file;   
        while ((A_word_file >= 65 && A_word_file <= 90) || (A_word_file >= 97 && A_word_file <= 122))    //记录下关键字内容   
        {   
            Temp_word[count] = A_word_file;   
            count++;   
            lst = A_word_file;   
            A_word_file = fgetc(Fp_file);   
        }   
        Temp_word[count] = '\0';   
        int point;   
        int Lct;   
        point = Lct = (fst * 100 + lst) % MAX_TXT;              //计算出文件读取时当前单词按Hash函数计算出在Hash表中的位置   
        int target = 0;                         //判断当前字母串是否是关键字,0表示不是,1表示是   
        if (strcmp(Key_words[Lct].words, Temp_word) == 0)   
        {   
            target = 1;   
        }   
        else  
        {   
            int temper = 0;   
            while (temper <= MAX_TXT && strcmp(Key_words[Lct].words, Temp_word) != 0 && Key_words[Lct].words)   
            {   
                temper++;   
                Lct++;   
                Lct = Lct % MAX_TXT;   
            }   
            if (strcmp(Key_words[Lct].words, Temp_word) == 0)   
            {   
                target = 1;   
            }   
        }   
        if (target == 1)   
        {   
            if (File_words[point].exist == 0)   
            {   
                strcpy(File_words[point].words, Temp_word);   
                File_words[point].exist = 1;   
                File_words[point].freq = 1;   
            }   
            else  
            {   
                if (strcmp(File_words[point].words, Temp_word) == 0)   
                {   
                    File_words[point].freq++;   
                }   
                else  
                {   
                    int change = 0;;   
                    while (strcmp(File_words[point].words, Temp_word) != 0 && File_words[point].exist == 1)   
                    {   
                        point++;   
                        change++;   
                        point = point % MAX_TXT;   
                    }   
                    File_words[point].clash_count += change;   
                    if (strcmp(File_words[point].words, Temp_word) == 0)   //冲突了的关键词   
                    {   
                        File_words[point].freq++;   
                    }   
                    else  
                    {   
                        strcpy(File_words[point].words, Temp_word);     //冲突了的关键词   
                        File_words[point].exist = 1;   
                        File_words[point].freq = 1;   
                    }   
                }   
            }   
        }   
    }   
    printf("\n"); printf("\n");   
    printf(" \t\t\t关键字表及其出现频度:\n");   
    printf("\n");   
    printf("  关键字\t    出现频度\t    在Hash表中的位置\t\t   冲突次数\n");   
    for (int i = 0; i < MAX_TXT; i++)   
    {   
        if (File_words[i].exist = 1 && File_words[i].freq != 0)   
        {   
            printf("  %-10s\t\t%d\t\t\t%d\t\t\t%d\n", File_words[i].words, File_words[i].freq, i, File_words[i].clash_count);   
        }   
    }   
    int allclash = 0;   
    int allkey = 0;   
    for (int j = 0; j < MAX_TXT; j++)   
    {   
        allclash += File_words[j].clash_count;   
        allkey += File_words[j].freq;   
    }   
    printf("\n");   
    printf("  关键字冲突总次数\t  关键字总数为\n");   
    printf("\t %d \t\t\t%d \t", allclash, allkey);   
    printf("\n");   
    printf("\n");   
    for (int k = 0; k < MAX_TXT; k++)                   //打印后数据清空!!!!!!!!!!!!   
    {   
        File_words[k].exist = 0;   
        File_words[k].clash_count = 0;   
        File_words[k].freq = 0;   
    }   
    fclose(Fp_file);   
  
}   
  
int main(void)   
{   
    int start = 1;   
    int choose;   
    int i;   
    printf("\n");   
  
    while (start != 0)   
    {   
        printf("\t*********扫描源程序,利用查找技术统计关键字频度**********\n");   
        printf("\t1---------Hash查找统计:\n");   
        printf("\t2---------二分查找统计:\n");   
        printf("\t0---------结束\n");   
  
        scanf("%d", &choose);   
        switch (choose)   
        {   
        case 1:ListHash();  //读取保存关键字的文件并将所有关键字存入Hash表中   
            File_funcHash(); break;    //Hash查找统计   
        case 2:creatbinary(); break;      //二分查找统计   
        case 0:exit(0);   
        }   
    }   
    return 0;   
}  Code language: C++ (cpp)
 Unless otherwise stated, the articles are original.
 Title of this article:Frequency of occurrence of hash table words
 Hyperlink to this article:https://manwish.cn/en/article/frequency-of-occurrence-of-hash-table-words.html

Leave a Comment