Semi-working scanner for HTML code
This commit is contained in:
		
							parent
							
								
									492f890283
								
							
						
					
					
						commit
						01ac6a36b6
					
				@ -1,21 +1,149 @@
 | 
			
		||||
const std = @import("std");
 | 
			
		||||
 | 
			
		||||
const Token = @import("token.zig").Token;
 | 
			
		||||
const Lexeme = @import("token.zig").Lexeme;
 | 
			
		||||
 | 
			
		||||
pub fn Scanner(comptime Source: type) type {
 | 
			
		||||
    return struct {
 | 
			
		||||
        const Self = @This();
 | 
			
		||||
        const SourceType = Source;
 | 
			
		||||
        line: u32,
 | 
			
		||||
        col: u32,
 | 
			
		||||
        current: u32,
 | 
			
		||||
pub const Scanner = struct {
 | 
			
		||||
    scanned: bool,
 | 
			
		||||
    line: u32,
 | 
			
		||||
    col: u32,
 | 
			
		||||
    current: u32,
 | 
			
		||||
    pos: u32,
 | 
			
		||||
 | 
			
		||||
        // fn scanArrayList(self: Self, source: SourceType, token_array: std.ArrayListAligned(Token, null)) !std.ArrayListAligned(Token, null) {
 | 
			
		||||
        //     _ = self;
 | 
			
		||||
        //     var eof = false;
 | 
			
		||||
        //     while (!eof) {
 | 
			
		||||
        //         // switch(
 | 
			
		||||
        //     }
 | 
			
		||||
        // }
 | 
			
		||||
    /// Builds a scanner in its starting state: `scanned` is false, the cursor
    /// is at line 1, column 1, and both `current` and `pos` are zero.
    pub fn init() Scanner {
        return Scanner{
            .scanned = false,
            .line = 1,
            .col = 1,
            .current = 0,
            .pos = 0,
        };
    }
 | 
			
		||||
 | 
			
		||||
    /// Tokenizes `source` from `self.pos` up to its end, appending one `Token`
    /// per lexeme to `token_array`.
    ///
    /// Each token is stamped with the scanner's `line`/`col` at the moment the
    /// lexeme starts, and its `raw` field is a slice of `source` (not a copy).
    /// `self.current` holds the byte length of the lexeme being processed.
    /// Fails only if appending to `token_array` fails.
    fn scanSlice(self: *Scanner, source: []const u8, token_array: *std.ArrayListAligned(Token, null)) !void {
        while (self.pos < source.len) {
            // Assume a one-byte lexeme; the wider cases below overwrite this.
            self.current = 1;

            const lexeme: Lexeme = switch (source[self.pos]) {
                '<' => .LT,
                '>' => .GT,
                '\n' => .NEWLINE,
                '=' => .EQUALS,
                '\'' => .SINGLE_QUOTE,
                '"' => .DOUBLE_QOUTE,
                ' ' => spaces: {
                    self.current = Lexeme.greedyCapture(.WHITESPACE, source, self.pos);
                    break :spaces .WHITESPACE;
                },
                else => other: {
                    // Two-byte lexemes are tried first, in this fixed order.
                    if (match(source, "{{", self.pos)) {
                        self.current = 2;
                        break :other .DOUBLE_OPEN_BRACE;
                    }
                    if (match(source, "}}", self.pos)) {
                        self.current = 2;
                        break :other .DOUBLE_CLOSE_BRACE;
                    }
                    if (match(source, "/>", self.pos)) {
                        self.current = 2;
                        break :other .GT_CLOSING;
                    }

                    // Anything else is treated as the start of a text run.
                    std.debug.print("Capturing text at {d}...\n", .{self.pos});
                    // TODO use greedyCapture instead
                    self.current = captureText(source, self.pos);
                    std.debug.print("Captured {d} characters\n", .{self.current});
                    break :other .TEXT;
                },
            };

            const raw = source[self.pos .. self.pos + self.current];
            self.pos += self.current;

            std.debug.print("{s}\n", .{raw});
            try token_array.append(Token{
                .col = self.col,
                .lexeme = lexeme,
                .line = self.line,
                .raw = raw,
            });

            // Advance the column past the lexeme; a newline then starts a fresh line.
            self.col += self.current;
            if (lexeme == .NEWLINE) {
                self.line += 1;
                self.col = 1;
            }
        }
    }
 | 
			
		||||
 | 
			
		||||
    /// Reports whether `key` occurs in `source` starting exactly at byte offset `pos`.
    ///
    /// Returns false (instead of slicing out of bounds) when `pos` lies past the
    /// end of `source` or when fewer than `key.len` bytes remain after `pos`.
    fn match(source: []const u8, key: []const u8, pos: u32) bool {
        // Guard the slice below: `source[pos..]` is only valid for pos <= len.
        if (pos > source.len) {
            return false;
        }
        // `startsWith` itself rejects a key longer than the remaining bytes.
        return std.mem.startsWith(u8, source[pos..], key);
    }
 | 
			
		||||
 | 
			
		||||
    /// Measures the run of plain text in `source` that begins at `pos`.
    ///
    /// The byte at `pos` is always counted. Scanning then continues until the
    /// end of `source` or until a delimiter is reached: `<`, `>`, newline, `=`,
    /// `'`, `"`, a space, or the start of `{{`, `}}` or `/>`.
    /// Returns the number of bytes in the run (a length, not an end position).
    fn captureText(source: []const u8, pos: u32) u32 {
        var length: u32 = 1;
        while (pos + length < source.len) : (length += 1) {
            const at = pos + length;
            const at_delimiter = switch (source[at]) {
                '<', '>', '\n', '=', '\'', '"', ' ' => true,
                // A lone brace or slash is ordinary text; only the pairs end the run.
                '{' => match(source, "{{", at),
                '}' => match(source, "}}", at),
                '/' => match(source, "/>", at),
                else => false,
            };
            if (at_delimiter) break;
        }
        return length;
    }
 | 
			
		||||
 | 
			
		||||
    /// Returns how many bytes of `source`, starting at `pos`, belong to a run
    /// of `lexeme`.
    ///
    /// Forwards to `Lexeme.greedyCapture`, which only handles `.WHITESPACE` and
    /// `.TEXT`; any other lexeme reaches `unreachable` there.
    /// NOTE(review): the original was an empty stub with no return value; this
    /// delegation follows the "use greedyCapture" TODO in `scanSlice` — confirm
    /// it is the intended behaviour.
    fn capture(source: []const u8, lexeme: Lexeme, pos: u32) u32 {
        return Lexeme.greedyCapture(lexeme, source, pos);
    }
 | 
			
		||||
 | 
			
		||||
    /// Scans `source` into a freshly created token list backed by `allocator`.
    ///
    /// On success the caller owns the returned list and must call `deinit` on
    /// it. The tokens' `raw` slices point into `source`, so `source` must
    /// outlive the list. If scanning fails, the partially filled list is freed
    /// before the error is returned.
    fn scanSliceAlloc(self: *Scanner, source: []const u8, allocator: std.mem.Allocator) !std.ArrayListAligned(Token, null) {
        var tokens = std.ArrayListAligned(Token, null).init(allocator);
        errdefer tokens.deinit();

        try self.scanSlice(source, &tokens);
        return tokens;
    }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Scans a small HTML snippet, checks that every input byte ended up in exactly
// one token, and dumps the tokens to stderr for manual inspection.
test "Scanner Test" {
    const html =
        \\      <a href='https://google.com'/>
        \\ <p>Another element</p>
        \\ <h1>{{zig_value}}</h1>
    ;

    var scanner = Scanner.init();
    var arr = std.ArrayListAligned(Token, null).init(std.testing.allocator);
    defer arr.deinit();
    scanner.scanSlice(html, &arr) catch |err| {
        std.debug.print("Got error: {any}", .{err});
        return error.Unexpected;
    };

    // The scanner consumes the input front to back, so the raw slices of all
    // tokens together must cover the whole source.
    var consumed: usize = 0;
    for (arr.items) |item| {
        consumed += item.raw.len;
    }
    try std.testing.expectEqual(html.len, consumed);

    for (arr.items) |item| {
        std.debug.print("{d}:{d}\t{any}\n\t\t{s}\n", .{ item.line, item.col, item.lexeme, item.raw });
    }
}
 | 
			
		||||
 | 
			
		||||
@ -8,19 +8,99 @@ pub const Token = struct {
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
pub const Lexeme = enum {
 | 
			
		||||
    NEWLINE,
 | 
			
		||||
    LT,
 | 
			
		||||
    GT,
 | 
			
		||||
    CLOSING_LT,
 | 
			
		||||
    EQUALS,
 | 
			
		||||
    TAG,
 | 
			
		||||
	SINGLE_QUOTE,
 | 
			
		||||
	DOUBLE_QOUTE,
 | 
			
		||||
	TEXT,
 | 
			
		||||
	DOUBLE_OPEN_BRACE,
 | 
			
		||||
	DOUBLE_CLOSE_BRACE,
 | 
			
		||||
    NEWLINE, // \n
 | 
			
		||||
    WHITESPACE, // any whitespace
 | 
			
		||||
    LT, // <
 | 
			
		||||
    GT_CLOSING, // />
 | 
			
		||||
    GT, // >
 | 
			
		||||
    EQUALS, // =
 | 
			
		||||
    SINGLE_QUOTE, // '
 | 
			
		||||
    DOUBLE_QOUTE, // "
 | 
			
		||||
    TEXT, // any text
 | 
			
		||||
    DOUBLE_OPEN_BRACE, // {{
 | 
			
		||||
    DOUBLE_CLOSE_BRACE, // }}
 | 
			
		||||
    BACKSLASH, // /
 | 
			
		||||
 | 
			
		||||
    /// Measures how many consecutive bytes of `source`, starting at `pos`,
    /// belong to a run of `lexeme`. The byte at `pos` is always counted.
    ///
    /// - `.WHITESPACE`: the run continues while the next byte is a space.
    /// - `.TEXT`: the run continues until `<`, `>`, newline, `=`, `'`, `"`,
    ///   a space, or the start of `{{`, `}}` or `/>`.
    ///
    /// Both runs also stop at the end of `source`. Any other lexeme is
    /// `unreachable`. Returns the length of the run in bytes.
    pub fn greedyCapture(lexeme: Lexeme, source: []const u8, pos: u32) u32 {
        var length: u32 = 1;
        switch (lexeme) {
            .WHITESPACE => {
                while (pos + length < source.len and source[pos + length] == ' ') {
                    length += 1;
                }
            },
            .TEXT => {
                while (pos + length < source.len) : (length += 1) {
                    const at = pos + length;
                    const at_delimiter = switch (source[at]) {
                        '<', '>', '\n', '=', '\'', '"', ' ' => true,
                        // A lone brace or slash is ordinary text; only the pairs end the run.
                        '{' => match(source, "{{", at),
                        '}' => match(source, "}}", at),
                        '/' => match(source, "/>", at),
                        else => false,
                    };
                    if (at_delimiter) break;
                }
            },
            else => unreachable,
        }
        return length;
    }
 | 
			
		||||
 | 
			
		||||
    /// Reports whether `key` occurs in `source` starting exactly at byte offset `pos`.
    ///
    /// Returns false (instead of slicing out of bounds) when `pos` lies past the
    /// end of `source` or when fewer than `key.len` bytes remain after `pos`.
    fn match(source: []const u8, key: []const u8, pos: u32) bool {
        // Guard the slice below: `source[pos..]` is only valid for pos <= len.
        if (pos > source.len) {
            return false;
        }
        // `startsWith` itself rejects a key longer than the remaining bytes.
        return std.mem.startsWith(u8, source[pos..], key);
    }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// const LexemeChar = union(Lexeme) {
 | 
			
		||||
//     NEWLINE: u8, // \n
 | 
			
		||||
//     LT: u8, // <
 | 
			
		||||
//     GT_CLOSING: u8, // />
 | 
			
		||||
//     GT: u8, // >
 | 
			
		||||
//     EQUALS: u8, // =
 | 
			
		||||
//     SINGLE_QUOTE: u8, // '
 | 
			
		||||
//     DOUBLE_QOUTE: u8, // "
 | 
			
		||||
//     TEXT: u8, // any text
 | 
			
		||||
//     DOUBLE_OPEN_BRACE: u8, // {{
 | 
			
		||||
//     DOUBLE_CLOSE_BRACE: u8, // }}
 | 
			
		||||
//     BACKSLASH: u8, // /
 | 
			
		||||
// };
 | 
			
		||||
 | 
			
		||||
pub const TagTypes = enum {
 | 
			
		||||
    a,
 | 
			
		||||
    abbr,
 | 
			
		||||
@ -67,16 +147,5 @@ pub const TagTypes = enum {
 | 
			
		||||
    video,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/// Attribute names recognised on tags.
/// Each name appears exactly once: an enum cannot declare the same field twice.
pub const TagAttributes = enum {
    href,
    alt,
    border,
    name,
    src,
    style,
    class,
    maxlength,
    onblur,
};
 | 
			
		||||
// <h3 class="gogo">text</h3>
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user