C中的Tokenizer未显示任何输出

jtw3ybtb  于 2023-05-22  发布在  其他
关注(0)|答案(1)|浏览(103)

我正在写一个简单的tokenizer来自学C语言,并且在很多似乎是未定义行为的情况下挣扎。代码似乎很好,我不知道问题出在哪里。当我运行程序时,在控制台中看不到任何输出。编译器根本不会抛出错误。
这是我尝试标记的简单文本文件:

1
2.34

xxy =  5e6

我下面的代码没有输出到控制台:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>
// #include "hexal.h"

#define HX_VERSION "0.0.1"

typedef enum
{
    TOK_EOF,                    // End of file      0
    TOK_ILLEGAL,                // Illegal token    1
    TOK_SPACE,                  // Space            2
    TOK_VAR,                    // var              3
    TOK_CONST,                  // const            4
    TOK_ASSIGN,                 // =                5
    TOK_INT_LIT,                // Integer literal  6
    TOK_FLOAT_LIT,              // Float literal    7
    TOK_EXPO_LIT,              // Float literal    7
    TOK_BOOL_LIT,               // Bool literal     8
    TOK_IDENTIFIER,             // Identifier       9
} Hx_Token_Type;

typedef struct Token
{
    Hx_Token_Type type;

    char *value;
    fpos_t cursor;
    int line;
} Token;

typedef enum {
    // Compiler Errors
    UNEXPECTED_TOKEN,
    UNEXPECTED_EOF,
    UNCLOSED_STRING,
    UNCLOSED_COMMENT,
    UNCLOSED_PAREN,
    UNCLOSED_BRACE,
    UNCLOSED_BRACKET,
    UNCLOSED_ANGLE_BRACKET,

    // Compiler Warnings
    UNUSED_IDENTIFIER,
    DEPRECATED_FEATURE,

    // 

} Hx_Compiler_Error_Types;

typedef struct {
    Hx_Compiler_Error_Types type;
    const char* err_message;
    size_t at_pos;
    size_t at_line;
} Hx_Compiler_Error;

// a type to hold the list of Hx_Compiler_Error

// Compiler metadata
typedef struct {
    Hx_Compiler_Error* Hx_Compiler_Errors;
    
    size_t Hx_Compiling_Duration;
    size_t Hx_Bytes_Compiled;

} Hx_Compiler_Metadata;

typedef struct {
    FILE* fp;

    int line;

    Token* prev_token;
    Token* curr_token;
    Token* next_token;

} Hx_File_Parsing_Context;

typedef struct {
    char* data;
    size_t length;
    size_t capacity;
} String;

String* init_string() {
    String* string = malloc(sizeof(String));
    if (string == NULL) {
        printf("Error: Failed to allocate memory\n");
        exit(EXIT_FAILURE);
    }

    string->length = 1;
    string->capacity = 8;
    string->data = malloc(string->capacity * sizeof(char));
    if (string->data == NULL) {
        printf("Error: Failed to allocate memory\n");
        exit(EXIT_FAILURE);
    }
    string->data[0] = '\0';

    return string;
}

// Function to append a char to a String
String* append_char_to_string(String* string, int ch) {
    if (string->length + 1 >= string->capacity) {
        // Double the capacity
        string->capacity *= 2;

        // Reallocate memory with double the capacity
        string->data = realloc(string->data, string->capacity * sizeof(char));
        if (string->data == NULL) {
            printf("Error: Failed to allocate memory\n");
            exit(EXIT_FAILURE);
        }
    }

    // Append the character to the string
    string->data[string->length] = (char) ch;
    string->data[string->length + 1] = '\0';
    string->length += 1;

    return string;
}

// Function to free the memory allocated for a String
void free_string(String* string) {
    free(string->data);
    free(string);
}

Token* get_next_token(Hx_File_Parsing_Context* ctx, Hx_Compiler_Metadata* meta) {
    // get the next token
    int ch = fgetc(ctx->fp);

    // setup return token
    Hx_Token_Type tok_type;
    String* tok_val = init_string();

    // Handle end of file
    if (ch == EOF) {
        tok_type = TOK_EOF;

        // Handle spaces & newlines. Roll all spaces into a single space token
    } else if (isspace(ch)) {
        do {
            if (ch == '\n') {
                ctx->line++;
            }
            ch = fgetc(ctx->fp);
        } while (isspace(ch));

        ungetc(ch, ctx->fp);
        tok_type = TOK_SPACE;

        // Handle Floats, exponentials and Integers
    } else if (isdigit(ch)) {
        tok_val = append_char_to_string(tok_val, ch);
        tok_type = TOK_INT_LIT;

        // roll all digits into a single float
        do {
            ch = fgetc(ctx->fp);
            if (ch == '_') {
                continue;
            } else if (ch == '.') {
                if (tok_type == TOK_INT_LIT) {
                    tok_val = append_char_to_string(tok_val, ch);
                    tok_type = TOK_FLOAT_LIT;
                } else {
                    ungetc(ch, ctx->fp);
                    break;
                }
            } else if (ch == 'e') {
                if (tok_type == TOK_INT_LIT || tok_type == TOK_FLOAT_LIT) {
                    tok_val = append_char_to_string(tok_val, ch);
                    tok_type = TOK_EXPO_LIT;
                } else {
                    ungetc(ch, ctx->fp);
                    break;
                }
            } else if (isdigit(ch)) {
                tok_val = append_char_to_string(tok_val, ch);
            } else {
                ungetc(ch, ctx->fp);
                break;
            }
        } while (isdigit(ch) || ch == '.' || ch == '_' || ch == 'e');

        // Handle Illegal Characters
    } else {
        tok_val = append_char_to_string(tok_val, ch);
        tok_type = TOK_ILLEGAL;
    }

    Token* res = malloc(sizeof(Token));
    res->type = tok_type;
    res->value = tok_val->data;
    res->line = ctx->line;
    free_string(tok_val);

    return res;
}

// parse_file
int parse_file(Hx_File_Parsing_Context* ctx, Hx_Compiler_Metadata* meta) {
    // advance the tokens to fill the prev, curr, and next tokens
    ctx->prev_token = ctx->curr_token;
    ctx->curr_token = ctx->next_token;
    ctx->next_token = get_next_token(ctx, meta);

    printf("\nAll Tokens in the file = \n");

    do {
        free(ctx->prev_token->value);
        free(ctx->prev_token);

        // advance the tokens in a loop till file ends
        ctx->prev_token = ctx->curr_token;
        ctx->curr_token = ctx->next_token;
        ctx->next_token = get_next_token(ctx, meta);
        // printf("{ type: %i, value: %s, line: %i },\n", ctx->curr_token->type, ctx->curr_token->value, ctx->curr_token->line);
        printf("{ type: %i, value: %s, line: %i },\n", (int)ctx->curr_token->type, ctx->curr_token->value, ctx->curr_token->line);

    } while (ctx->curr_token->type != TOK_EOF);

    return 0;
}

// Section for handling multiple files
int compile_source(char* entry_file) {
    // parse = tokenize + parse + typecheck
    // compile = parse + generate
    // present = compile + present + run

    Hx_File_Parsing_Context ctx;
    ctx.line = 1;

    Hx_Compiler_Metadata meta;
    meta.Hx_Bytes_Compiled = 0;
    meta.Hx_Compiling_Duration = 0;
    meta.Hx_Compiler_Errors = NULL;

    ctx.fp = fopen(entry_file, "r, ccs=UTF-8");
    if (ctx.fp == NULL) {
        printf("Error: Failed to open file\n");
        exit(EXIT_FAILURE);
    }
    printf("Parsing file: %s\n", entry_file);
    int res = parse_file(&ctx, &meta);

    fclose(ctx.fp);

    return res;
}

int main(int argc, char* argv[]) {

    int res = compile_source("input/test.hex");

    return res;
}

这是我在控制台中看到的:

src\hexal.c:251:14: warning: 'fopen' is deprecated: This
      function or variable may be unsafe. Consider using
      fopen_s instead. To disable deprecation, use
      _CRT_SECURE_NO_WARNINGS. See online help for
      details. [-Wdeprecated-declarations]
    ctx.fp = fopen(entry_file, "r, ccs=UTF-8");
             ^
C:\Program Files (x86)\Windows Kits\10\Include\10.0.19041.0\ucrt\stdio.h:212:20: note: 
      'fopen' has been explicitly marked deprecated here
    _Check_return_ _CRT_INSECURE_DEPRECATE(fopen_s)
                   ^
C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\include\vcruntime.h:355:55: note: 
      expanded from macro '_CRT_INSECURE_DEPRECATE'
        #define _CRT_INSECURE_DEPRECATE(_Replacement) _C...
                                                      ^
C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\include\vcruntime.h:345:47: note: 
      expanded from macro '_CRT_DEPRECATE_TEXT'
#define _CRT_DEPRECATE_TEXT(_Text) __declspec(deprecated...
                                              ^
1 warning generated.
   Creating library bin\hexal.lib and object bin\hexal.exp
Parsing file: input/test.hex

C:\Users\risharan\Documents\GitHub\seawitch>cls && clang -g3 -Wall -Wextra -Wconversion -Wdouble-promotion -Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion -fsanitize=undefined src\hexal.c -o bin\hexal.exe && bin\hexal.exe
eivgtgni

eivgtgni1#

在我的系统上运行你的代码,我得到这个运行时错误:

230520-token(77235,0x20a1d8140) malloc: *** error for object 0x40000: pointer being freed was not allocated
230520-token(77235,0x20a1d8140) malloc: *** set a breakpoint in malloc_error_break to debug
/bin/sh: line 1: 77235 Abort trap: 6           ./230520-token
make: *** [230520-token.run] Error 134

阅读你的代码,我可以看到这些问题:

*[major]ctxmeta都没有在main中初始化。您应该初始化这些结构以确保所有指针都为null:

Hx_File_Parsing_Context ctx = { 0 };
Hx_Compiler_Metadata meta = { 0 };

***[major]**在释放上一个token时,需要测试它是否已经分配:

if (ctx->prev_token) {
      free(ctx->prev_token->value);
      free(ctx->prev_token);
  }

***[major]**在init_string中,string->length的初始值应为0

  • append_char_to_string不会改变它的第一个参数,您不需要将tok_val更新为tok_val = append_char_to_string(tok_val, ch);,只需使用
append_char_to_string(tok_val, ch);
  • 读取数字的循环应该是for (;;)循环。使用do/while循环增加了冗余测试。不支持十六进制数。
  • 你应该在get_next_token的开头保存行号,这样line字段就被设置为令牌开始的那一行。当您解析多行标记(例如注解和带转义换行符的多行标记)时,这将更加重要。
  • 你应该解析标识符、运算符、字符串、字符常量、注解……
    ***[major]**你用free_string(tok_val);释放了字符串,但你保存了指向令牌结构的内容指针,所以这个指针变得无效。您应该使用
res->value = strdup(tok_val->data);

解决了上述主要问题,我得到了以下输出:

All Tokens in the file =
{ type: 6, value: 1, line: 1 },
{ type: 2, value: , line: 2 },
{ type: 7, value: 2.34, line: 2 },
{ type: 2, value: , line: 5 },
{ type: 1, value: x, line: 5 },
{ type: 1, value: x, line: 5 },
{ type: 1, value: y, line: 5 },
{ type: 2, value: , line: 5 },
{ type: 1, value: =, line: 5 },
{ type: 2, value: , line: 5 },
{ type: 8, value: 5e6, line: 5 },
{ type: 2, value: , line: 6 },
{ type: 0, value: , line: 6 },

相关问题