diff options
author | Michael Smith <mikesmiffy128@gmail.com> | 2021-11-20 03:10:50 +0000 |
---|---|---|
committer | Michael Smith <mikesmiffy128@gmail.com> | 2021-11-20 03:18:08 +0000 |
commit | da6f343032cb01597dc7866e66f091adf3243a62 (patch) | |
tree | 870f8cb8e82bb42202ab92bea03fc6ab35ada7ca /src/3p/chibicc/preprocess.c |
Initial public snapshot
With code from Bill. Thanks Bill!
Diffstat (limited to 'src/3p/chibicc/preprocess.c')
-rw-r--r-- | src/3p/chibicc/preprocess.c | 1208 |
1 files changed, 1208 insertions, 0 deletions
diff --git a/src/3p/chibicc/preprocess.c b/src/3p/chibicc/preprocess.c new file mode 100644 index 0000000..cd8d1d8 --- /dev/null +++ b/src/3p/chibicc/preprocess.c @@ -0,0 +1,1208 @@ +// This file implements the C preprocessor. +// +// The preprocessor takes a list of tokens as an input and returns a +// new list of tokens as an output. +// +// The preprocessing language is designed in such a way that that's +// guaranteed to stop even if there is a recursive macro. +// Informally speaking, a macro is applied only once for each token. +// That is, if a macro token T appears in a result of direct or +// indirect macro expansion of T, T won't be expanded any further. +// For example, if T is defined as U, and U is defined as T, then +// token T is expanded to U and then to T and the macro expansion +// stops at that point. +// +// To achieve the above behavior, we attach for each token a set of +// macro names from which the token is expanded. The set is called +// "hideset". Hideset is initially empty, and every time we expand a +// macro, the macro name is added to the resulting tokens' hidesets. +// +// The above macro expansion algorithm is explained in this document +// written by Dave Prossor, which is used as a basis for the +// standard's wording: +// https://github.com/rui314/chibicc/wiki/cpp.algo.pdf + +#include "chibicc.h" + +typedef struct MacroParam MacroParam; +struct MacroParam { + MacroParam *next; + char *name; +}; + +typedef struct MacroArg MacroArg; +struct MacroArg { + MacroArg *next; + char *name; + bool is_va_args; + Token *tok; +}; + +typedef Token *macro_handler_fn(Token *); + +typedef struct Macro Macro; +struct Macro { + char *name; + bool is_objlike; // Object-like or function-like + MacroParam *params; + char *va_args_name; + Token *body; + macro_handler_fn *handler; +}; + +// `#if` can be nested, so we use a stack to manage nested `#if`s. +typedef struct CondIncl CondIncl; +struct CondIncl { + CondIncl *next; + enum { IN_THEN, IN_ELIF, IN_ELSE } ctx; + Token *tok; + bool included; +}; + +typedef struct Hideset Hideset; +struct Hideset { + Hideset *next; + char *name; +}; + +static HashMap macros; +static CondIncl *cond_incl; +static HashMap pragma_once; +static int include_next_idx; + +static Token *preprocess2(Token *tok); +static Macro *find_macro(Token *tok); + +static bool is_hash(Token *tok) { + return tok->at_bol && equal(tok, "#"); +} + +// Some preprocessor directives such as #include allow extraneous +// tokens before newline. This function skips such tokens. +static Token *skip_line(Token *tok) { + if (tok->at_bol) + return tok; + warn_tok(tok, "extra token"); + while (!tok->at_bol) + tok = tok->next; + return tok; +} + +static Token *copy_token(Token *tok) { + Token *t = calloc(1, sizeof(Token)); + *t = *tok; + t->next = NULL; + return t; +} + +static Token *new_eof(Token *tok) { + Token *t = copy_token(tok); + t->kind = TK_EOF; + t->len = 0; + return t; +} + +static Hideset *new_hideset(char *name) { + Hideset *hs = calloc(1, sizeof(Hideset)); + hs->name = name; + return hs; +} + +static Hideset *hideset_union(Hideset *hs1, Hideset *hs2) { + Hideset head = {}; + Hideset *cur = &head; + + for (; hs1; hs1 = hs1->next) + cur = cur->next = new_hideset(hs1->name); + cur->next = hs2; + return head.next; +} + +static bool hideset_contains(Hideset *hs, char *s, int len) { + for (; hs; hs = hs->next) + if (strlen(hs->name) == len && !strncmp(hs->name, s, len)) + return true; + return false; +} + +static Hideset *hideset_intersection(Hideset *hs1, Hideset *hs2) { + Hideset head = {}; + Hideset *cur = &head; + + for (; hs1; hs1 = hs1->next) + if (hideset_contains(hs2, hs1->name, strlen(hs1->name))) + cur = cur->next = new_hideset(hs1->name); + return head.next; +} + +static Token *add_hideset(Token *tok, Hideset *hs) { + Token head = {}; + Token *cur = &head; + + for (; tok; tok = tok->next) { + Token *t = copy_token(tok); + t->hideset = hideset_union(t->hideset, hs); + cur = cur->next = t; + } + return head.next; +} + +// Append tok2 to the end of tok1. +static Token *append(Token *tok1, Token *tok2) { + if (tok1->kind == TK_EOF) + return tok2; + + Token head = {}; + Token *cur = &head; + + for (; tok1->kind != TK_EOF; tok1 = tok1->next) + cur = cur->next = copy_token(tok1); + cur->next = tok2; + return head.next; +} + +static Token *skip_cond_incl2(Token *tok) { + while (tok->kind != TK_EOF) { + if (is_hash(tok) && + (equal(tok->next, "if") || equal(tok->next, "ifdef") || + equal(tok->next, "ifndef"))) { + tok = skip_cond_incl2(tok->next->next); + continue; + } + if (is_hash(tok) && equal(tok->next, "endif")) + return tok->next->next; + tok = tok->next; + } + return tok; +} + +// Skip until next `#else`, `#elif` or `#endif`. +// Nested `#if` and `#endif` are skipped. +static Token *skip_cond_incl(Token *tok) { + while (tok->kind != TK_EOF) { + if (is_hash(tok) && + (equal(tok->next, "if") || equal(tok->next, "ifdef") || + equal(tok->next, "ifndef"))) { + tok = skip_cond_incl2(tok->next->next); + continue; + } + + if (is_hash(tok) && + (equal(tok->next, "elif") || equal(tok->next, "else") || + equal(tok->next, "endif"))) + break; + tok = tok->next; + } + return tok; +} + +// Double-quote a given string and returns it. +static char *quote_string(char *str) { + int bufsize = 3; + for (int i = 0; str[i]; i++) { + if (str[i] == '\\' || str[i] == '"') + bufsize++; + bufsize++; + } + + char *buf = calloc(1, bufsize); + char *p = buf; + *p++ = '"'; + for (int i = 0; str[i]; i++) { + if (str[i] == '\\' || str[i] == '"') + *p++ = '\\'; + *p++ = str[i]; + } + *p++ = '"'; + *p++ = '\0'; + return buf; +} + +static Token *new_str_token(char *str, Token *tmpl) { + char *buf = quote_string(str); + return tokenize(new_file(tmpl->file->name, tmpl->file->file_no, buf)); +} + +// Copy all tokens until the next newline, terminate them with +// an EOF token and then returns them. This function is used to +// create a new list of tokens for `#if` arguments. +static Token *copy_line(Token **rest, Token *tok) { + Token head = {}; + Token *cur = &head; + + for (; !tok->at_bol; tok = tok->next) + cur = cur->next = copy_token(tok); + + cur->next = new_eof(tok); + *rest = tok; + return head.next; +} + +static Token *new_num_token(int val, Token *tmpl) { + char *buf = format("%d\n", val); + return tokenize(new_file(tmpl->file->name, tmpl->file->file_no, buf)); +} + +static Token *read_const_expr(Token **rest, Token *tok) { + tok = copy_line(rest, tok); + + Token head = {}; + Token *cur = &head; + + while (tok->kind != TK_EOF) { + // "defined(foo)" or "defined foo" becomes "1" if macro "foo" + // is defined. Otherwise "0". + if (equal(tok, "defined")) { + Token *start = tok; + bool has_paren = consume(&tok, tok->next, "("); + + if (tok->kind != TK_IDENT) + error_tok(start, "macro name must be an identifier"); + Macro *m = find_macro(tok); + tok = tok->next; + + if (has_paren) + tok = skip(tok, ")"); + + cur = cur->next = new_num_token(m ? 1 : 0, start); + continue; + } + + cur = cur->next = tok; + tok = tok->next; + } + + cur->next = tok; + return head.next; +} + +// Read and evaluate a constant expression. +static long eval_const_expr(Token **rest, Token *tok) { + Token *start = tok; + Token *expr = read_const_expr(rest, tok->next); + expr = preprocess2(expr); + + if (expr->kind == TK_EOF) + error_tok(start, "no expression"); + + // [https://www.sigbus.info/n1570#6.10.1p4] The standard requires + // we replace remaining non-macro identifiers with "0" before + // evaluating a constant expression. For example, `#if foo` is + // equivalent to `#if 0` if foo is not defined. + for (Token *t = expr; t->kind != TK_EOF; t = t->next) { + if (t->kind == TK_IDENT) { + Token *next = t->next; + *t = *new_num_token(0, t); + t->next = next; + } + } + + // Convert pp-numbers to regular numbers + convert_pp_tokens(expr); + + Token *rest2; + long val = const_expr(&rest2, expr); + if (rest2->kind != TK_EOF) + error_tok(rest2, "extra token"); + return val; +} + +static CondIncl *push_cond_incl(Token *tok, bool included) { + CondIncl *ci = calloc(1, sizeof(CondIncl)); + ci->next = cond_incl; + ci->ctx = IN_THEN; + ci->tok = tok; + ci->included = included; + cond_incl = ci; + return ci; +} + +static Macro *find_macro(Token *tok) { + if (tok->kind != TK_IDENT) + return NULL; + return hashmap_get2(¯os, tok->loc, tok->len); +} + +static Macro *add_macro(char *name, bool is_objlike, Token *body) { + Macro *m = calloc(1, sizeof(Macro)); + m->name = name; + m->is_objlike = is_objlike; + m->body = body; + hashmap_put(¯os, name, m); + return m; +} + +static MacroParam *read_macro_params(Token **rest, Token *tok, char **va_args_name) { + MacroParam head = {}; + MacroParam *cur = &head; + + while (!equal(tok, ")")) { + if (cur != &head) + tok = skip(tok, ","); + + if (equal(tok, "...")) { + *va_args_name = "__VA_ARGS__"; + *rest = skip(tok->next, ")"); + return head.next; + } + + if (tok->kind != TK_IDENT) + error_tok(tok, "expected an identifier"); + + if (equal(tok->next, "...")) { + *va_args_name = strndup(tok->loc, tok->len); + *rest = skip(tok->next->next, ")"); + return head.next; + } + + MacroParam *m = calloc(1, sizeof(MacroParam)); + m->name = strndup(tok->loc, tok->len); + cur = cur->next = m; + tok = tok->next; + } + + *rest = tok->next; + return head.next; +} + +static void read_macro_definition(Token **rest, Token *tok) { + if (tok->kind != TK_IDENT) + error_tok(tok, "macro name must be an identifier"); + char *name = strndup(tok->loc, tok->len); + tok = tok->next; + + if (!tok->has_space && equal(tok, "(")) { + // Function-like macro + char *va_args_name = NULL; + MacroParam *params = read_macro_params(&tok, tok->next, &va_args_name); + + Macro *m = add_macro(name, false, copy_line(rest, tok)); + m->params = params; + m->va_args_name = va_args_name; + } else { + // Object-like macro + add_macro(name, true, copy_line(rest, tok)); + } +} + +static MacroArg *read_macro_arg_one(Token **rest, Token *tok, bool read_rest) { + Token head = {}; + Token *cur = &head; + int level = 0; + + for (;;) { + if (level == 0 && equal(tok, ")")) + break; + if (level == 0 && !read_rest && equal(tok, ",")) + break; + + if (tok->kind == TK_EOF) + error_tok(tok, "premature end of input"); + + if (equal(tok, "(")) + level++; + else if (equal(tok, ")")) + level--; + + cur = cur->next = copy_token(tok); + tok = tok->next; + } + + cur->next = new_eof(tok); + + MacroArg *arg = calloc(1, sizeof(MacroArg)); + arg->tok = head.next; + *rest = tok; + return arg; +} + +static MacroArg * +read_macro_args(Token **rest, Token *tok, MacroParam *params, char *va_args_name) { + Token *start = tok; + tok = tok->next->next; + + MacroArg head = {}; + MacroArg *cur = &head; + + MacroParam *pp = params; + for (; pp; pp = pp->next) { + if (cur != &head) + tok = skip(tok, ","); + cur = cur->next = read_macro_arg_one(&tok, tok, false); + cur->name = pp->name; + } + + if (va_args_name) { + MacroArg *arg; + if (equal(tok, ")")) { + arg = calloc(1, sizeof(MacroArg)); + arg->tok = new_eof(tok); + } else { + if (pp != params) + tok = skip(tok, ","); + arg = read_macro_arg_one(&tok, tok, true); + } + arg->name = va_args_name;; + arg->is_va_args = true; + cur = cur->next = arg; + } else if (pp) { + error_tok(start, "too many arguments"); + } + + skip(tok, ")"); + *rest = tok; + return head.next; +} + +static MacroArg *find_arg(MacroArg *args, Token *tok) { + for (MacroArg *ap = args; ap; ap = ap->next) + if (tok->len == strlen(ap->name) && !strncmp(tok->loc, ap->name, tok->len)) + return ap; + return NULL; +} + +// Concatenates all tokens in `tok` and returns a new string. +static char *join_tokens(Token *tok, Token *end) { + // Compute the length of the resulting token. + int len = 1; + for (Token *t = tok; t != end && t->kind != TK_EOF; t = t->next) { + if (t != tok && t->has_space) + len++; + len += t->len; + } + + char *buf = calloc(1, len); + + // Copy token texts. + int pos = 0; + for (Token *t = tok; t != end && t->kind != TK_EOF; t = t->next) { + if (t != tok && t->has_space) + buf[pos++] = ' '; + strncpy(buf + pos, t->loc, t->len); + pos += t->len; + } + buf[pos] = '\0'; + return buf; +} + +// Concatenates all tokens in `arg` and returns a new string token. +// This function is used for the stringizing operator (#). +static Token *stringize(Token *hash, Token *arg) { + // Create a new string token. We need to set some value to its + // source location for error reporting function, so we use a macro + // name token as a template. + char *s = join_tokens(arg, NULL); + return new_str_token(s, hash); +} + +// Concatenate two tokens to create a new token. +static Token *paste(Token *lhs, Token *rhs) { + // Paste the two tokens. + char *buf = format("%.*s%.*s", lhs->len, lhs->loc, rhs->len, rhs->loc); + + // Tokenize the resulting string. + Token *tok = tokenize(new_file(lhs->file->name, lhs->file->file_no, buf)); + if (tok->next->kind != TK_EOF) + error_tok(lhs, "pasting forms '%s', an invalid token", buf); + return tok; +} + +static bool has_varargs(MacroArg *args) { + for (MacroArg *ap = args; ap; ap = ap->next) + if (!strcmp(ap->name, "__VA_ARGS__")) + return ap->tok->kind != TK_EOF; + return false; +} + +// Replace func-like macro parameters with given arguments. +static Token *subst(Token *tok, MacroArg *args) { + Token head = {}; + Token *cur = &head; + + while (tok->kind != TK_EOF) { + // "#" followed by a parameter is replaced with stringized actuals. + if (equal(tok, "#")) { + MacroArg *arg = find_arg(args, tok->next); + if (!arg) + error_tok(tok->next, "'#' is not followed by a macro parameter"); + cur = cur->next = stringize(tok, arg->tok); + tok = tok->next->next; + continue; + } + + // [GNU] If __VA_ARG__ is empty, `,##__VA_ARGS__` is expanded + // to the empty token list. Otherwise, its expaned to `,` and + // __VA_ARGS__. + if (equal(tok, ",") && equal(tok->next, "##")) { + MacroArg *arg = find_arg(args, tok->next->next); + if (arg && arg->is_va_args) { + if (arg->tok->kind == TK_EOF) { + tok = tok->next->next->next; + } else { + cur = cur->next = copy_token(tok); + tok = tok->next->next; + } + continue; + } + } + + if (equal(tok, "##")) { + if (cur == &head) + error_tok(tok, "'##' cannot appear at start of macro expansion"); + + if (tok->next->kind == TK_EOF) + error_tok(tok, "'##' cannot appear at end of macro expansion"); + + MacroArg *arg = find_arg(args, tok->next); + if (arg) { + if (arg->tok->kind != TK_EOF) { + *cur = *paste(cur, arg->tok); + for (Token *t = arg->tok->next; t->kind != TK_EOF; t = t->next) + cur = cur->next = copy_token(t); + } + tok = tok->next->next; + continue; + } + + *cur = *paste(cur, tok->next); + tok = tok->next->next; + continue; + } + + MacroArg *arg = find_arg(args, tok); + + if (arg && equal(tok->next, "##")) { + Token *rhs = tok->next->next; + + if (arg->tok->kind == TK_EOF) { + MacroArg *arg2 = find_arg(args, rhs); + if (arg2) { + for (Token *t = arg2->tok; t->kind != TK_EOF; t = t->next) + cur = cur->next = copy_token(t); + } else { + cur = cur->next = copy_token(rhs); + } + tok = rhs->next; + continue; + } + + for (Token *t = arg->tok; t->kind != TK_EOF; t = t->next) + cur = cur->next = copy_token(t); + tok = tok->next; + continue; + } + + // If __VA_ARG__ is empty, __VA_OPT__(x) is expanded to the + // empty token list. Otherwise, __VA_OPT__(x) is expanded to x. + if (equal(tok, "__VA_OPT__") && equal(tok->next, "(")) { + MacroArg *arg = read_macro_arg_one(&tok, tok->next->next, true); + if (has_varargs(args)) + for (Token *t = arg->tok; t->kind != TK_EOF; t = t->next) + cur = cur->next = t; + tok = skip(tok, ")"); + continue; + } + + // Handle a macro token. Macro arguments are completely macro-expanded + // before they are substituted into a macro body. + if (arg) { + Token *t = preprocess2(arg->tok); + t->at_bol = tok->at_bol; + t->has_space = tok->has_space; + for (; t->kind != TK_EOF; t = t->next) + cur = cur->next = copy_token(t); + tok = tok->next; + continue; + } + + // Handle a non-macro token. + cur = cur->next = copy_token(tok); + tok = tok->next; + continue; + } + + cur->next = tok; + return head.next; +} + +// If tok is a macro, expand it and return true. +// Otherwise, do nothing and return false. +static bool expand_macro(Token **rest, Token *tok) { + if (hideset_contains(tok->hideset, tok->loc, tok->len)) + return false; + + Macro *m = find_macro(tok); + if (!m) + return false; + + // Built-in dynamic macro application such as __LINE__ + if (m->handler) { + *rest = m->handler(tok); + (*rest)->next = tok->next; + return true; + } + + // Object-like macro application + if (m->is_objlike) { + Hideset *hs = hideset_union(tok->hideset, new_hideset(m->name)); + Token *body = add_hideset(m->body, hs); + for (Token *t = body; t->kind != TK_EOF; t = t->next) + t->origin = tok; + *rest = append(body, tok->next); + (*rest)->at_bol = tok->at_bol; + (*rest)->has_space = tok->has_space; + return true; + } + + // If a funclike macro token is not followed by an argument list, + // treat it as a normal identifier. + if (!equal(tok->next, "(")) + return false; + + // Function-like macro application + Token *macro_token = tok; + MacroArg *args = read_macro_args(&tok, tok, m->params, m->va_args_name); + Token *rparen = tok; + + // Tokens that consist a func-like macro invocation may have different + // hidesets, and if that's the case, it's not clear what the hideset + // for the new tokens should be. We take the interesection of the + // macro token and the closing parenthesis and use it as a new hideset + // as explained in the Dave Prossor's algorithm. + Hideset *hs = hideset_intersection(macro_token->hideset, rparen->hideset); + hs = hideset_union(hs, new_hideset(m->name)); + + Token *body = subst(m->body, args); + body = add_hideset(body, hs); + for (Token *t = body; t->kind != TK_EOF; t = t->next) + t->origin = macro_token; + *rest = append(body, tok->next); + (*rest)->at_bol = macro_token->at_bol; + (*rest)->has_space = macro_token->has_space; + return true; +} + +char *search_include_paths(char *filename) { + if (filename[0] == '/') + return filename; + + static HashMap cache; + char *cached = hashmap_get(&cache, filename); + if (cached) + return cached; + + // Search a file from the include paths. + for (int i = 0; i < include_paths.len; i++) { + char *path = format("%s/%s", include_paths.data[i], filename); + if (!file_exists(path)) + continue; + hashmap_put(&cache, filename, path); + include_next_idx = i + 1; + return path; + } + return NULL; +} + +static char *search_include_next(char *filename) { + for (; include_next_idx < include_paths.len; include_next_idx++) { + char *path = format("%s/%s", include_paths.data[include_next_idx], filename); + if (file_exists(path)) + return path; + } + return NULL; +} + +// Read an #include argument. +static char *read_include_filename(Token **rest, Token *tok, bool *is_dquote) { + // Pattern 1: #include "foo.h" + if (tok->kind == TK_STR) { + // A double-quoted filename for #include is a special kind of + // token, and we don't want to interpret any escape sequences in it. + // For example, "\f" in "C:\foo" is not a formfeed character but + // just two non-control characters, backslash and f. + // So we don't want to use token->str. + *is_dquote = true; + *rest = skip_line(tok->next); + return strndup(tok->loc + 1, tok->len - 2); + } + + // Pattern 2: #include <foo.h> + if (equal(tok, "<")) { + // Reconstruct a filename from a sequence of tokens between + // "<" and ">". + Token *start = tok; + + // Find closing ">". + for (; !equal(tok, ">"); tok = tok->next) + if (tok->at_bol || tok->kind == TK_EOF) + error_tok(tok, "expected '>'"); + + *is_dquote = false; + *rest = skip_line(tok->next); + return join_tokens(start->next, tok); + } + + // Pattern 3: #include FOO + // In this case FOO must be macro-expanded to either + // a single string token or a sequence of "<" ... ">". + if (tok->kind == TK_IDENT) { + Token *tok2 = preprocess2(copy_line(rest, tok)); + return read_include_filename(&tok2, tok2, is_dquote); + } + + error_tok(tok, "expected a filename"); +} + +// Detect the following "include guard" pattern. +// +// #ifndef FOO_H +// #define FOO_H +// ... +// #endif +static char *detect_include_guard(Token *tok) { + // Detect the first two lines. + if (!is_hash(tok) || !equal(tok->next, "ifndef")) + return NULL; + tok = tok->next->next; + + if (tok->kind != TK_IDENT) + return NULL; + + char *macro = strndup(tok->loc, tok->len); + tok = tok->next; + + if (!is_hash(tok) || !equal(tok->next, "define") || !equal(tok->next->next, macro)) + return NULL; + + // Read until the end of the file. + while (tok->kind != TK_EOF) { + if (!is_hash(tok)) { + tok = tok->next; + continue; + } + + if (equal(tok->next, "endif") && tok->next->next->kind == TK_EOF) + return macro; + + if (equal(tok, "if") || equal(tok, "ifdef") || equal(tok, "ifndef")) + tok = skip_cond_incl(tok->next); + else + tok = tok->next; + } + return NULL; +} + +static Token *include_file(Token *tok, char *path, Token *filename_tok) { + // Check for "#pragma once" + if (hashmap_get(&pragma_once, path)) + return tok; + + // If we read the same file before, and if the file was guarded + // by the usual #ifndef ... #endif pattern, we may be able to + // skip the file without opening it. + static HashMap include_guards; + char *guard_name = hashmap_get(&include_guards, path); + if (guard_name && hashmap_get(¯os, guard_name)) + return tok; + + Token *tok2 = tokenize_file(path); + if (!tok2) + error_tok(filename_tok, "%s: cannot open file: %s", path, strerror(errno)); + + guard_name = detect_include_guard(tok2); + if (guard_name) + hashmap_put(&include_guards, path, guard_name); + + return append(tok2, tok); +} + +// Read #line arguments +static void read_line_marker(Token **rest, Token *tok) { + Token *start = tok; + tok = preprocess(copy_line(rest, tok)); + + if (tok->kind != TK_NUM || tok->ty->kind != TY_INT) + error_tok(tok, "invalid line marker"); + start->file->line_delta = tok->val - start->line_no; + + tok = tok->next; + if (tok->kind == TK_EOF) + return; + + if (tok->kind != TK_STR) + error_tok(tok, "filename expected"); + start->file->display_name = tok->str; +} + +// Visit all tokens in `tok` while evaluating preprocessing +// macros and directives. +static Token *preprocess2(Token *tok) { + Token head = {}; + Token *cur = &head; + + while (tok->kind != TK_EOF) { + // If it is a macro, expand it. + if (expand_macro(&tok, tok)) + continue; + + // Pass through if it is not a "#". + if (!is_hash(tok)) { + tok->line_delta = tok->file->line_delta; + tok->filename = tok->file->display_name; + cur = cur->next = tok; + tok = tok->next; + continue; + } + + Token *start = tok; + tok = tok->next; + + if (equal(tok, "include")) { + bool is_dquote; + char *filename = read_include_filename(&tok, tok->next, &is_dquote); + + if (filename[0] != '/' && is_dquote) { + char *path = format("%s/%s", dirname(strdup(start->file->name)), filename); + if (file_exists(path)) { + tok = include_file(tok, path, start->next->next); + continue; + } + } + + char *path = search_include_paths(filename); + tok = include_file(tok, path ? path : filename, start->next->next); + continue; + } + + if (equal(tok, "include_next")) { + bool ignore; + char *filename = read_include_filename(&tok, tok->next, &ignore); + char *path = search_include_next(filename); + tok = include_file(tok, path ? path : filename, start->next->next); + continue; + } + + if (equal(tok, "define")) { + read_macro_definition(&tok, tok->next); + continue; + } + + if (equal(tok, "undef")) { + tok = tok->next; + if (tok->kind != TK_IDENT) + error_tok(tok, "macro name must be an identifier"); + undef_macro(strndup(tok->loc, tok->len)); + tok = skip_line(tok->next); + continue; + } + + if (equal(tok, "if")) { + long val = eval_const_expr(&tok, tok); + push_cond_incl(start, val); + if (!val) + tok = skip_cond_incl(tok); + continue; + } + + if (equal(tok, "ifdef")) { + bool defined = find_macro(tok->next); + push_cond_incl(tok, defined); + tok = skip_line(tok->next->next); + if (!defined) + tok = skip_cond_incl(tok); + continue; + } + + if (equal(tok, "ifndef")) { + bool defined = find_macro(tok->next); + push_cond_incl(tok, !defined); + tok = skip_line(tok->next->next); + if (defined) + tok = skip_cond_incl(tok); + continue; + } + + if (equal(tok, "elif")) { + if (!cond_incl || cond_incl->ctx == IN_ELSE) + error_tok(start, "stray #elif"); + cond_incl->ctx = IN_ELIF; + + if (!cond_incl->included && eval_const_expr(&tok, tok)) + cond_incl->included = true; + else + tok = skip_cond_incl(tok); + continue; + } + + if (equal(tok, "else")) { + if (!cond_incl || cond_incl->ctx == IN_ELSE) + error_tok(start, "stray #else"); + cond_incl->ctx = IN_ELSE; + tok = skip_line(tok->next); + + if (cond_incl->included) + tok = skip_cond_incl(tok); + continue; + } + + if (equal(tok, "endif")) { + if (!cond_incl) + error_tok(start, "stray #endif"); + cond_incl = cond_incl->next; + tok = skip_line(tok->next); + continue; + } + + if (equal(tok, "line")) { + read_line_marker(&tok, tok->next); + continue; + } + + if (tok->kind == TK_PP_NUM) { + read_line_marker(&tok, tok); + continue; + } + + if (equal(tok, "pragma") && equal(tok->next, "once")) { + hashmap_put(&pragma_once, tok->file->name, (void *)1); + tok = skip_line(tok->next->next); + continue; + } + + if (equal(tok, "pragma")) { + do { + tok = tok->next; + } while (!tok->at_bol); + continue; + } + + if (equal(tok, "error")) + error_tok(tok, "error"); + + // `#`-only line is legal. It's called a null directive. + if (tok->at_bol) + continue; + + error_tok(tok, "invalid preprocessor directive"); + } + + cur->next = tok; + return head.next; +} + +void define_macro(char *name, char *buf) { + Token *tok = tokenize(new_file("<built-in>", 1, buf)); + add_macro(name, true, tok); +} + +void undef_macro(char *name) { + hashmap_delete(¯os, name); +} + +static Macro *add_builtin(char *name, macro_handler_fn *fn) { + Macro *m = add_macro(name, true, NULL); + m->handler = fn; + return m; +} + +static Token *file_macro(Token *tmpl) { + while (tmpl->origin) + tmpl = tmpl->origin; + return new_str_token(tmpl->file->display_name, tmpl); +} + +static Token *line_macro(Token *tmpl) { + while (tmpl->origin) + tmpl = tmpl->origin; + int i = tmpl->line_no + tmpl->file->line_delta; + return new_num_token(i, tmpl); +} + +// __COUNTER__ is expanded to serial values starting from 0. +static Token *counter_macro(Token *tmpl) { + static int i = 0; + return new_num_token(i++, tmpl); +} + +// __TIMESTAMP__ is expanded to a string describing the last +// modification time of the current file. E.g. +// "Fri Jul 24 01:32:50 2020" +static Token *timestamp_macro(Token *tmpl) { + struct stat st; + if (stat(tmpl->file->name, &st) != 0) + return new_str_token("??? ??? ?? ??:??:?? ????", tmpl); + + char buf[30]; + ctime_r(&st.st_mtime, buf); + buf[24] = '\0'; + return new_str_token(buf, tmpl); +} + +static Token *base_file_macro(Token *tmpl) { + return new_str_token(base_file, tmpl); +} + +// __DATE__ is expanded to the current date, e.g. "May 17 2020". +static char *format_date(struct tm *tm) { + static char mon[][4] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", + }; + + return format("\"%s %2d %d\"", mon[tm->tm_mon], tm->tm_mday, tm->tm_year + 1900); +} + +// __TIME__ is expanded to the current time, e.g. "13:34:03". +static char *format_time(struct tm *tm) { + return format("\"%02d:%02d:%02d\"", tm->tm_hour, tm->tm_min, tm->tm_sec); +} + +void init_macros(void) { + // Define predefined macros + define_macro("_LP64", "1"); + define_macro("__C99_MACRO_WITH_VA_ARGS", "1"); + define_macro("__ELF__", "1"); + define_macro("__LP64__", "1"); + define_macro("__SIZEOF_DOUBLE__", "8"); + define_macro("__SIZEOF_FLOAT__", "4"); + define_macro("__SIZEOF_INT__", "4"); + define_macro("__SIZEOF_LONG_DOUBLE__", "8"); + define_macro("__SIZEOF_LONG_LONG__", "8"); + define_macro("__SIZEOF_LONG__", "8"); + define_macro("__SIZEOF_POINTER__", "8"); + define_macro("__SIZEOF_PTRDIFF_T__", "8"); + define_macro("__SIZEOF_SHORT__", "2"); + define_macro("__SIZEOF_SIZE_T__", "8"); + define_macro("__SIZE_TYPE__", "unsigned long"); + define_macro("__STDC_HOSTED__", "1"); + define_macro("__STDC_NO_COMPLEX__", "1"); + define_macro("__STDC_UTF_16__", "1"); + define_macro("__STDC_UTF_32__", "1"); + define_macro("__STDC_VERSION__", "201112L"); + define_macro("__STDC__", "1"); + define_macro("__USER_LABEL_PREFIX__", ""); + define_macro("__alignof__", "_Alignof"); + define_macro("__amd64", "1"); + define_macro("__amd64__", "1"); + define_macro("__chibicc__", "1"); + define_macro("__const__", "const"); + define_macro("__gnu_linux__", "1"); + define_macro("__inline__", "inline"); + define_macro("__linux", "1"); + define_macro("__linux__", "1"); + define_macro("__signed__", "signed"); + define_macro("__typeof__", "typeof"); + define_macro("__unix", "1"); + define_macro("__unix__", "1"); + define_macro("__volatile__", "volatile"); + define_macro("__x86_64", "1"); + define_macro("__x86_64__", "1"); + define_macro("linux", "1"); + define_macro("unix", "1"); + + add_builtin("__FILE__", file_macro); + add_builtin("__LINE__", line_macro); + add_builtin("__COUNTER__", counter_macro); + add_builtin("__TIMESTAMP__", timestamp_macro); + add_builtin("__BASE_FILE__", base_file_macro); + + time_t now = time(NULL); + struct tm *tm = localtime(&now); + define_macro("__DATE__", format_date(tm)); + define_macro("__TIME__", format_time(tm)); +} + +typedef enum { + STR_NONE, STR_UTF8, STR_UTF16, STR_UTF32, STR_WIDE, +} StringKind; + +static StringKind getStringKind(Token *tok) { + if (!strcmp(tok->loc, "u8")) + return STR_UTF8; + + switch (tok->loc[0]) { + case '"': return STR_NONE; + case 'u': return STR_UTF16; + case 'U': return STR_UTF32; + case 'L': return STR_WIDE; + } + unreachable(); +} + +// Concatenate adjacent string literals into a single string literal +// as per the C spec. +static void join_adjacent_string_literals(Token *tok) { + // First pass: If regular string literals are adjacent to wide + // string literals, regular string literals are converted to a wide + // type before concatenation. In this pass, we do the conversion. + for (Token *tok1 = tok; tok1->kind != TK_EOF;) { + if (tok1->kind != TK_STR || tok1->next->kind != TK_STR) { + tok1 = tok1->next; + continue; + } + + StringKind kind = getStringKind(tok1); + Type *basety = tok1->ty->base; + + for (Token *t = tok1->next; t->kind == TK_STR; t = t->next) { + StringKind k = getStringKind(t); + if (kind == STR_NONE) { + kind = k; + basety = t->ty->base; + } else if (k != STR_NONE && kind != k) { + error_tok(t, "unsupported non-standard concatenation of string literals"); + } + } + + if (basety->size > 1) + for (Token *t = tok1; t->kind == TK_STR; t = t->next) + if (t->ty->base->size == 1) + *t = *tokenize_string_literal(t, basety); + + while (tok1->kind == TK_STR) + tok1 = tok1->next; + } + + // Second pass: concatenate adjacent string literals. + for (Token *tok1 = tok; tok1->kind != TK_EOF;) { + if (tok1->kind != TK_STR || tok1->next->kind != TK_STR) { + tok1 = tok1->next; + continue; + } + + Token *tok2 = tok1->next; + while (tok2->kind == TK_STR) + tok2 = tok2->next; + + int len = tok1->ty->array_len; + for (Token *t = tok1->next; t != tok2; t = t->next) + len = len + t->ty->array_len - 1; + + char *buf = calloc(tok1->ty->base->size, len); + + int i = 0; + for (Token *t = tok1; t != tok2; t = t->next) { + memcpy(buf + i, t->str, t->ty->size); + i = i + t->ty->size - t->ty->base->size; + } + + *tok1 = *copy_token(tok1); + tok1->ty = array_of(tok1->ty->base, len); + tok1->str = buf; + tok1->next = tok2; + tok1 = tok2; + } +} + +// Entry point function of the preprocessor. +Token *preprocess(Token *tok) { + tok = preprocess2(tok); + if (cond_incl) + error_tok(cond_incl->tok, "unterminated conditional directive"); + convert_pp_tokens(tok); + join_adjacent_string_literals(tok); + + for (Token *t = tok; t; t = t->next) + t->line_no += t->line_delta; + return tok; +} |