Primitive code aware syntax highlighting

After a few not-quite-fast-enough attempts at brining code aware syntax highlighting to my 4coder custom layer I am happy to report that it's finally working! This includes highlighting of functions, macros, and types (everything that's currently being stored by the code index). Preview with placeholder colors:



As this seems to be quite a popular feature in other code editors, I though that I'd share my implementation. The idea is simple: tokens are hashed and placed into a hash table. When determining if a token should be highlighted its hash is recomputed at looked up in the table. With this setup I finally got it to not dip below 60fps (tested on the 4coder custom layer project).

I rolled the simplest possible separate chaining hashtable.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
struct Code_Index_Note_Mini {
    Code_Index_Note_Mini *next;
    Code_Index_Note_Kind note_kind;
    u64 hash;
};

struct Code_Index_Table {
    Code_Index_Note_Mini **table;
    i32 size;
};

struct Code_Index_File{
    Code_Index_Nest_List nest_list;
    Code_Index_Nest_Ptr_Array nest_array;
    Code_Index_Note_List note_list;
    Code_Index_Note_Ptr_Array note_array;
    Code_Index_Table note_table; /* <--- this */
    Buffer_ID buffer;
};


Table construction and lookup:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
function Code_Index_Table
code_index_note_table_from_array(Arena *arena, Code_Index_Note_Ptr_Array note_array)
{
    Code_Index_Table result = {};
    
    result.size = 1000;
    result.table = push_array_zero(arena, Code_Index_Note_Mini*, result.size);
    
    for (i32 i = 0; i < note_array.count; ++i) {
        struct Code_Index_Note_Mini *mini_note = push_array(arena, Code_Index_Note_Mini, 1);
        
        mini_note->hash = _djb2(note_array.ptrs[i]->text);
        mini_note->note_kind = note_array.ptrs[i]->note_kind;
        mini_note->next = 0;
        
        i32 bucket = mini_note->hash % result.size;
        if (result.table[bucket] != 0) {
            mini_note->next = result.table[bucket];
        }
        result.table[bucket] = mini_note;
    }
    
    return(result);
}

function Code_Index_Note_Kind
code_index_table_lookup(Code_Index_Table table, u64 hash)
{
    i32 bucket = hash % table.size;
    
    if (table.table[bucket] == 0) {
        return(CodeIndexNote_NONE);
    } 
    
    Code_Index_Note_Mini *note = table.table[bucket];
    
    while (note) {
        if (note->hash == hash) {
            return(note->note_kind);
        }
        note = note->next;
    }
    
    return(CodeIndexNote_NONE);
}


The construction routine is called at the end of generic_parse_full_input_breaks:
1
2
3
4
5
6
...
if (result){
        index->nest_array = code_index_nest_ptr_array_from_list(state->arena, &index->nest_list);
        index->note_array = code_index_note_ptr_array_from_list(state->arena, &index->note_list);
        index->note_table = code_index_note_table_from_array(state->arena, index->note_array);  /* <--- this */
}


Updated draw_cpp_token_colors with the lookup:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
function void
draw_cpp_token_colors(Application_Links *app, Text_Layout_ID text_layout_id, Token_Array *array, Buffer_ID current_buffer){
    Range_i64 visible_range = text_layout_get_visible_range(app, text_layout_id);
    i64 first_index = token_index_from_pos(array, visible_range.first);
    Token_Iterator_Array it = token_iterator_index(0, array, first_index);
    for (;;){
        Token *token = token_it_read(&it);
        if (token->pos >= visible_range.one_past_last){
            break;
        }
        
        String_Const_u8 token_text = push_token_lexeme(app, &global_code_index.node_arena, current_buffer, token);
        u64 hash = _djb2(token_text);
        bool found = false;
        
        for (Buffer_ID buffer = get_buffer_next(app, 0, Access_Always);
             !found && buffer != 0;
             buffer = get_buffer_next(app, buffer, Access_Always))
        {
            Code_Index_File *file = code_index_get_file(buffer);
            if (file != 0) {            
                Code_Index_Note_Kind note_kind = code_index_table_lookup(file->note_table, hash);
                switch (note_kind) {
                    case CodeIndexNote_Function: {
                        token->kind = TokenBaseKind_UserDefinedFunction;
                        found = true;
                        break;
                    };
                    
                    case CodeIndexNote_Macro: {
                        token->kind = TokenBaseKind_UserDefinedMacro;
                        found = true;
                        break;
                    };
                    
                    case CodeIndexNote_Type: {
                        token->kind = TokenBaseKind_UserDefinedType;
                        found = true;
                        break;
                    }
                }
            }
        }
        
        FColor color = get_token_color_cpp(*token);
        ARGB_Color argb = fcolor_resolve(color);
        paint_text_color(app, text_layout_id, Ii64_size(token->pos, token->size), argb);
        if (!token_it_inc_all(&it)){
            break;
        }
    }
}


You also need to add a few items to the Token_Base_Kind enum, as well as register new colors in 4coder_default_colors.h, but that's not worth posting imo (as if everything above is..)

Edited by Alexey on Reason: Initial post
Nice!

If performance starts to become troublesome, I wonder if making
1
Code_Index_Note_Mini
more cache friendly might grant the largest gain for the time investment.

Edited by Jesse on
Really cool thank you for this :)
Hi everyone,

Thank you for your work aolo2 !

I just wanted to point out that there is small issue with the implementation. Using the global node arena seems to make 4coder to use an ever increasing amount of memory as (I suppose) the allocations are never freed.

What I did to fix this problem is that I created a Scratch_Block at the beginning of the rendering function and I used this arena in the push_token_lexeme function.
Hello, Corentin!

Thank you for pointing out the issue. To be clear, the implemetation was more of a ProofOfConcept than anything else. Other flaws include:
- no colflict resolution between note types (solved by creating three tables instead of one, and resolving in order of macro > type > function)
- exact hash collisions are not handled at all (solved by adding an string compare in case of hash match)
- the table is not modified after creation, so separate chaining is definitely a wrong choice (its upside is easy deletion, which is not needed, and it's slow). Something like a 80% filled linear probing, or possibly a cache table with a small array for leftovers would be much better

Glad you liked it though! Also, I snooped around Ryans custom layer and it seems Allen has plans of adding index tables himself.
A small update.

Writing custom token types straight to token->kind appears to be a bad idea as it interferes with parsing logic (custom
types are preserved after buffer modification, so the parser doesn't recognize declarations anymore). Adding a user_kind field to token type and checking it separately solved the issue.

Obligatory 4coder screenshot:
I created a module that does the same thing with minimal modifications of 4coder's code. I posted it on the wiki to make it easier to find:
MODULE Primitive highlighting. Any feedback is appreciated.
Looks great to me, you seem to have addressed all my points of concern. The benchmarks also look good.

This leaves only two improvements I think of:
  1. If this is getting the wiki treatment, it may be time to tighen the bolts on the parser: it's doesn't recognize anonymous struct or union typedefs as declarations, and those are common for C. Example:
    1
    2
    3
    typedef struct {
        int foo;
    } bar;
    

    I understand that filing an issue on Allen's github may be what we should do, but he doesn't seem to be active there in the last few weeks.
  2. The other problem is that struct field names can clash with user-defined types (and functions?). Example:
    1
    2
    3
    4
    struct s1 { int a; };
    struct s2 { int s1; };
    s1 a = {};
    a.s2 = 0;
    

    Here in a.s2 s2 will get highlighted as a type. What can we do to prevent this? Seems like an endgame feature tbh, because it requires a full compile (does it?). Is it possible to do a somewhat-full C++ parse in the given time budget? Say a 10k line file in a few ms.

    After thinking about it, we could just disable TYPE highlighting if last token was a dot.

You should fill an issue on github for 1. While Allen isn't always active on 4coder, he will most likely go through the issues at some point.

For 2, we could fix the type after a . or -> with the following code.
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
if ( note ) {

    switch ( note->note_kind ) {
        case CodeIndexNote_Type: {
            color = fcolor_id( primitive_highlight_type );
        } break;

        case CodeIndexNote_Function: {
            color = fcolor_id( primitive_highlight_function );
        } break;

        case CodeIndexNote_Macro: {
            color = fcolor_id( primitive_highlight_macro );
        } break;

        case CodeIndexNote_4coderCommand: {
            /* NOTE: 4coder doesn't create those notes as of 4.1.6. */
            color = fcolor_id( primitive_highlight_4coder_command );
        } break;
    }

    colored = true;               
#if 1
    if ( note->note_kind == CodeIndexNote_Type ) {

        Token_Iterator_Array dot_arrow_it = it;

        if ( token_it_dec_non_whitespace( &dot_arrow_it ) ) {

            Token* dot_arrow = token_it_read( &dot_arrow_it );

            if ( dot_arrow->kind == TokenBaseKind_Operator && ( dot_arrow->sub_kind == TokenCppKind_Dot || dot_arrow->sub_kind == TokenCppKind_Arrow ) ) {
                colored = false;
            }
        }
    }
#endif
}


But it wouldn't fix the following issue:
1
2
3
4
5
struct s1 { int a; };
struct s2 { int s1; }; /* <-- s1 is colored as a type. */

s2 a = { };
a.s1 = 0; /* <-- This would be fixed. */