Semi-working scanner for HTML code

2023-11-15 22:53:31 -07:00 · 2023-11-15 22:53:31 -07:00 · 01ac6a36b6
commit 01ac6a36b6
parent 492f890283
2 changed files with 235 additions and 38 deletions
--- a/src/htzx/html_parser/scanner.zig
+++ b/src/htzx/html_parser/scanner.zig
@ -1,21 +1,149 @@
 const std = @import("std");
 const Token = @import("token.zig").Token;
 const Lexeme = @import("token.zig").Lexeme;
-pub fn Scanner(comptime Source: type) type {
+pub const Scanner = struct {
-    return struct {
+    scanned: bool,
-        const Self = @This();
+    line: u32,
-        const SourceType = Source;
+    col: u32,
-        line: u32,
+    current: u32,
-        col: u32,
+    pos: u32,
        current: u32,
-        // fn scanArrayList(self: Self, source: SourceType, token_array: std.ArrayListAligned(Token, null)) !std.ArrayListAligned(Token, null) {
+    pub fn init() Scanner {
-        //     _ = self;
+        return .{ .scanned = false, .line = 1, .col = 1, .current = 0, .pos = 0 };
-        //     var eof = false;
+    }
-        //     while (!eof) {
+
-        //         // switch(
+    fn scanSlice(self: *Scanner, source: []const u8, token_array: *std.ArrayListAligned(Token, null)) !void {
        // Tokenizes `source` starting at `self.pos`, appending one Token per lexeme to
        // `token_array` and advancing `self.pos`, `self.col` and `self.line` as it goes.
        // The only error source visible here is the `append` call on `token_array`.
-        //     }
+        var eof = false;
-        // }
+        blk: while (true) {
            // Stop once every byte of `source` has been consumed.
            eof = self.pos >= source.len;
            if (eof) {
                break :blk;
            }
            // Default token length is one byte; multi-byte lexemes overwrite it below.
            self.current = 1;
            var char = source[self.pos];
            var lexeme: Lexeme =
                switch (char) {
                '<' => .LT,
                '>' => .GT,
                '\n' => .NEWLINE,
                '=' => .EQUALS,
                '\'' => .SINGLE_QUOTE,
                '"' => .DOUBLE_QOUTE,
                // A run of consecutive spaces becomes a single WHITESPACE token.
                ' ' => lex: {
                    self.current = Lexeme.greedyCapture(.WHITESPACE, source, self.pos);
                    break :lex .WHITESPACE;
                },
                // Two-byte lexemes are tried first; anything else is captured as TEXT.
                else => lex: {
                    if (match(source, "{{", self.pos)) {
                        self.current = 2;
                        break :lex .DOUBLE_OPEN_BRACE;
                    } else if (match(source, "}}", self.pos)) {
                        self.current = 2;
                        break :lex .DOUBLE_CLOSE_BRACE;
                    } else if (match(source, "/>", self.pos)) {
                        self.current = 2;
                        break :lex .GT_CLOSING;
                    } else {
                        std.debug.print("Capturing text at {d}...\n", .{self.pos});
                        // TODO use greedyCapture instead
                        self.current = captureText(source, self.pos);
                        std.debug.print("Captured {d} characters\n", .{self.current});
                        break :lex .TEXT;
                    }
                },
            };
            // `raw` is a view into `source` (no copy is made here).
            var raw = source[self.pos .. self.pos + self.current];
            self.pos += self.current;
            std.debug.print("{s}\n", .{raw});
            // The token records the column at which it starts; the column advances afterwards.
            try token_array.append(Token{ .col = self.col, .lexeme = lexeme, .line = self.line, .raw = raw });
            self.col += self.current;
            // A newline moves to the next line and resets the column to 1.
            if (lexeme == .NEWLINE) {
                self.line += 1;
                self.col = 1;
            }
        }
    }
    /// Reports whether `key` occurs in `source` starting exactly at byte offset `pos`.
    ///
    /// Returns false, rather than reading out of bounds, when `pos` lies past the
    /// end of `source` or when fewer than `key.len` bytes remain from `pos`.
    fn match(source: []const u8, key: []const u8, pos: u32) bool {
        // Guard the slice below: `source[pos..]` is only valid for pos <= source.len.
        if (pos > source.len) {
            return false;
        }
        // startsWith handles the "remaining input shorter than key" case itself.
        return std.mem.startsWith(u8, source[pos..], key);
    }
    /// Measures the run of plain text that begins at `pos` in `source`.
    ///
    /// The byte at `pos` is always counted without being inspected. Scanning then
    /// continues until the end of `source` or until the next byte starts another
    /// lexeme: one of `<`, `>`, `\n`, `=`, `'`, `"`, a space, or one of the
    /// two-byte sequences `{{`, `}}`, `/>`.
    ///
    /// Returns the number of bytes captured (a length, not an end position).
    fn captureText(source: []const u8, pos: u32) u32 {
        var current: u32 = 1;
        while (pos + current < source.len) : (current += 1) {
            const rest = source[pos + current ..];
            const stop = switch (rest[0]) {
                '<', '>', '\n', '=', '\'', '"', ' ' => true,
                // Two-byte delimiters are tested against the remaining input only,
                // so a lone brace or slash as the final byte is treated as text
                // instead of slicing past the end of `source`.
                '{' => std.mem.startsWith(u8, rest, "{{"),
                '}' => std.mem.startsWith(u8, rest, "}}"),
                '/' => std.mem.startsWith(u8, rest, "/>"),
                else => false,
            };
            if (stop) break;
        }
        return current;
    }
    /// Unimplemented stub: all three parameters are discarded and nothing is returned.
    /// NOTE(review): the signature promises a `u32`, but the body has no `return`
    /// statement, so this presumably cannot be called as written. The intended
    /// behavior is not visible here — confirm before using or removing it.
    fn capture(source: []const u8, lexeme: Lexeme, pos: u32) u32 {
        _ = pos;
        _ = lexeme;
        _ = source;
    }
    /// Scans `source` into a freshly allocated token list.
    ///
    /// The list is created with `allocator`, filled by `scanSlice`, and returned.
    /// The caller owns the returned list and must call `deinit` on it. The `raw`
    /// field of each token is a slice of `source`, so `source` must outlive the
    /// tokens. If scanning fails, the partially filled list is freed before the
    /// error is propagated.
    fn scanSliceAlloc(self: *Scanner, source: []const u8, allocator: std.mem.Allocator) !std.ArrayListAligned(Token, null) {
        var tokens = std.ArrayListAligned(Token, null).init(allocator);
        // Release the list only on the error path; on success ownership moves to the caller.
        errdefer tokens.deinit();
        try self.scanSlice(source, &tokens);
        return tokens;
    }
 };
 test "Scanner Test" {
    // Smoke test: scans a small HTML snippet and dumps every token to stderr.
    // It makes no assertions beyond "scanning did not return an error".
    const TokenList = std.ArrayListAligned(Token, null);
    const markup =
        \\      <a href='https://google.com'/>
        \\ <p>Another element</p>
        \\ <h1>{{zig_value}}</h1>
    ;

    var tokens = TokenList.init(std.testing.allocator);
    defer tokens.deinit();

    var html_scanner = Scanner.init();
    html_scanner.scanSlice(markup, &tokens) catch |scan_error| {
        std.debug.print("Got error: {any}", .{scan_error});
        return error.Unexpected;
    };

    // Print "line:col<TAB>lexeme" followed by the raw text of each token.
    var index: usize = 0;
    while (index < tokens.items.len) : (index += 1) {
        const token = tokens.items[index];
        std.debug.print("{d}:{d}\t{any}\n\t\t{s}\n", .{ token.line, token.col, token.lexeme, token.raw });
    }
 }
--- a/src/htzx/html_parser/token.zig
+++ b/src/htzx/html_parser/token.zig
@ -8,19 +8,99 @@ pub const Token = struct {
 };
 pub const Lexeme = enum {
-    NEWLINE,
+    NEWLINE, // \n
-    LT,
+    WHITESPACE, // any whitespace
-    GT,
+    LT, // <
-    CLOSING_LT,
+    GT_CLOSING, // />
-    EQUALS,
+    GT, // >
-    TAG,
+    EQUALS, // =
-	SINGLE_QUOTE,
+    SINGLE_QUOTE, // '
-	DOUBLE_QOUTE,
+    DOUBLE_QOUTE, // "
-	TEXT,
+    TEXT, // any text
-	DOUBLE_OPEN_BRACE,
+    DOUBLE_OPEN_BRACE, // {{
-	DOUBLE_CLOSE_BRACE,
+    DOUBLE_CLOSE_BRACE, // }}
    BACKSLASH, // /
    /// Measures how many bytes, starting at `pos` in `source`, form one run of `lexeme`.
    ///
    /// Only `.WHITESPACE` and `.TEXT` are supported; passing any other lexeme
    /// reaches `unreachable`. The byte at `pos` is always counted without being
    /// inspected, so the result is at least 1.
    ///
    /// - `.WHITESPACE`: the run continues while the next byte is a space (`' '`).
    /// - `.TEXT`: the run continues until the end of `source` or until the next
    ///   byte is one of `<`, `>`, `\n`, `=`, `'`, `"`, a space, or begins one of
    ///   the two-byte sequences `{{`, `}}`, `/>`.
    ///
    /// Returns the length of the run in bytes.
    pub fn greedyCapture(lexeme: Lexeme, source: []const u8, pos: u32) u32 {
        var current: u32 = 1;
        switch (lexeme) {
            .WHITESPACE => {
                while (pos + current < source.len and source[pos + current] == ' ') {
                    current += 1;
                }
            },
            .TEXT => {
                while (pos + current < source.len) : (current += 1) {
                    const rest = source[pos + current ..];
                    const stop = switch (rest[0]) {
                        '<', '>', '\n', '=', '\'', '"', ' ' => true,
                        // Two-byte delimiters are tested against the remaining
                        // input only, so a lone brace or slash as the final byte
                        // counts as text instead of slicing past the end.
                        '{' => std.mem.startsWith(u8, rest, "{{"),
                        '}' => std.mem.startsWith(u8, rest, "}}"),
                        '/' => std.mem.startsWith(u8, rest, "/>"),
                        else => false,
                    };
                    if (stop) break;
                }
            },
            // Callers must only request greedy capture for the two lexemes above.
            else => unreachable,
        }
        return current;
    }
    /// Reports whether `key` occurs in `source` starting exactly at byte offset `pos`.
    ///
    /// Returns false, rather than reading out of bounds, when `pos` lies past the
    /// end of `source` or when fewer than `key.len` bytes remain from `pos`.
    fn match(source: []const u8, key: []const u8, pos: u32) bool {
        // Guard the slice below: `source[pos..]` is only valid for pos <= source.len.
        if (pos > source.len) {
            return false;
        }
        // startsWith handles the "remaining input shorter than key" case itself.
        return std.mem.startsWith(u8, source[pos..], key);
    }
 };
 // const LexemeChar = union(Lexeme) {
 //     NEWLINE: u8, // \n
 //     LT: u8, // <
 //     GT_CLOSING: u8, // />
 //     GT: u8, // >
 //     EQUALS: u8, // =
 //     SINGLE_QUOTE: u8, // '
 //     DOUBLE_QOUTE: u8, // "
 //     TEXT: u8, // any text
 //     DOUBLE_OPEN_BRACE: u8, // {{
 //     DOUBLE_CLOSE_BRACE: u8, // }}
 //     BACKSLASH: u8, // /
 // };
 pub const TagTypes = enum {
    a,
    abbr,
@ -67,16 +147,5 @@ pub const TagTypes = enum {
    video,
 };
/// HTML attribute names known to the parser.
/// Each attribute appears exactly once; enum field names must be unique.
pub const TagAttributes = enum {
    href,
    alt,
    border,
    name,
    src,
    style,
    class,
    maxlength,
    onblur,
};
 // <h3 class="gogo">text</h3>