const std = @import("std");
const Token = @import("token.zig").Token;
const Lexeme = @import("token.zig").Lexeme;

/// Hand-written HTML tokenizer. Walks a byte slice and emits `Token`s
/// (lexeme kind, raw text, 1-based line/col) into a caller-supplied list.
pub const Scanner = struct {
    /// True once a scan pass has run to completion.
    scanned: bool,
    /// 1-based line of the next token to be emitted.
    line: u32,
    /// 1-based column of the next token to be emitted.
    col: u32,
    /// Length in bytes of the token currently being captured.
    current: u32,
    /// Absolute byte offset into the source being scanned.
    pos: u32,

    /// Returns a scanner positioned at the start of input (line 1, col 1).
    pub fn init() Scanner {
        return .{ .scanned = false, .line = 1, .col = 1, .current = 0, .pos = 0 };
    }

    /// Tokenizes `source`, appending one `Token` per lexeme to `token_array`.
    /// Single-character lexemes are matched first; "{{", "}}" and "/>" are
    /// two-byte lexemes; anything else is greedily captured as TEXT.
    /// Tokens borrow slices of `source`; `source` must outlive the list.
    fn scanSlice(self: *Scanner, source: []const u8, token_array: *std.ArrayListAligned(Token, null)) !void {
        while (self.pos < source.len) {
            self.current = 1;
            const char = source[self.pos];
            const lexeme: Lexeme = switch (char) {
                '<' => .LT,
                '>' => .GT,
                '\n' => .NEWLINE,
                '=' => .EQUALS,
                '\'' => .SINGLE_QUOTE,
                '"' => .DOUBLE_QOUTE,
                ' ' => lex: {
                    // A run of spaces collapses into one WHITESPACE token.
                    self.current = Lexeme.greedyCapture(.WHITESPACE, source, self.pos);
                    break :lex .WHITESPACE;
                },
                else => lex: {
                    if (match(source, "{{", self.pos)) {
                        self.current = 2;
                        break :lex .DOUBLE_OPEN_BRACE;
                    } else if (match(source, "}}", self.pos)) {
                        self.current = 2;
                        break :lex .DOUBLE_CLOSE_BRACE;
                    } else if (match(source, "/>", self.pos)) {
                        self.current = 2;
                        break :lex .GT_CLOSING;
                    } else {
                        // TODO use greedyCapture instead
                        self.current = captureText(source, self.pos);
                        break :lex .TEXT;
                    }
                },
            };

            const raw = source[self.pos .. self.pos + self.current];
            self.pos += self.current;

            std.log.debug("scanned token: {s}", .{raw});
            try token_array.append(Token{ .col = self.col, .lexeme = lexeme, .line = self.line, .raw = raw });
            self.col += self.current;

            // A NEWLINE token advances the line counter and resets the column.
            if (lexeme == .NEWLINE) {
                self.line += 1;
                self.col = 1;
            }
        }
        // Record that a full pass over the input has completed.
        self.scanned = true;
    }

    /// Returns true when `key` occurs in `source` starting exactly at `pos`.
    /// Safe for any `pos`; lookups that would run past the end return false.
    fn match(source: []const u8, key: []const u8, pos: u32) bool {
        // Bounds-check against the *remaining* input, not the whole slice:
        // the original checked only `source.len < key.len`, so a partial
        // match at the very end of `source` sliced out of bounds.
        if (pos + key.len > source.len) {
            return false;
        }
        return std.mem.eql(u8, source[pos .. pos + key.len], key);
    }

    /// Greedily consumes a TEXT run starting at `pos`.
    /// Returns the number of bytes captured (always at least 1); stops in
    /// front of any single-character lexeme or a "{{" / "}}" / "/>" pair.
    fn captureText(source: []const u8, pos: u32) u32 {
        var current: u32 = 1;
        blk: while (pos + current < source.len) {
            const char = source[pos + current];
            switch (char) {
                '<', '>', '\n', '=', '\'', '"', ' ' => break :blk,
                '{' => {
                    if (match(source, "{{", pos + current)) {
                        break :blk;
                    }
                },
                '}' => {
                    if (match(source, "}}", pos + current)) {
                        break :blk;
                    }
                },
                '/' => {
                    if (match(source, "/>", pos + current)) {
                        break :blk;
                    }
                },
                else => {},
            }
            current += 1;
        }
        return current;
    }

    /// Generic greedy capture for an arbitrary lexeme kind.
    /// Not implemented yet. (The original body discarded its arguments and
    /// fell off the end of a `u32` function, which does not compile; an
    /// explicit panic keeps the placeholder honest until it is written.)
    fn capture(source: []const u8, lexeme: Lexeme, pos: u32) u32 {
        _ = pos;
        _ = lexeme;
        _ = source;
        @panic("TODO: implement capture");
    }

    /// Convenience wrapper around `scanSlice` that owns the token list.
    /// Caller owns the returned list and must `deinit` it.
    /// (The original body discarded its arguments and returned nothing,
    /// which does not compile for an error-union return type.)
    fn scanSliceAlloc(self: *Scanner, source: []const u8, allocator: std.mem.Allocator) !std.ArrayListAligned(Token, null) {
        var token_array = std.ArrayList(Token).init(allocator);
        errdefer token_array.deinit();
        try self.scanSlice(source, &token_array);
        return token_array;
    }
};

test "Scanner Test" {
    // NOTE(review): the original test was truncated in this view; this is a
    // minimal reconstruction that exercises the scanner end to end.
    const html =
        \\<div>
        \\Another element
        \\</div>
    ;
    var scanner = Scanner.init();
    var tokens = std.ArrayList(Token).init(std.testing.allocator);
    defer tokens.deinit();
    try scanner.scanSlice(html, &tokens);
    try std.testing.expect(tokens.items.len > 0);
    try std.testing.expect(scanner.scanned);
}