//! Reconstructed from a whitespace-mangled `git format-patch` email:
//!   commit 01ac6a36b6c34c54cfd1097b48c7ffc2d13c3d1d
//!   Author: Nathan Anderson — Wed, 15 Nov 2023
//!   Subject: [PATCH] Semi working scanner for html code
//! The patch touched src/htzx/html_parser/token.zig and
//! src/htzx/html_parser/scanner.zig; both post-patch files are reproduced
//! below as one self-contained unit (token definitions first, because the
//! scanner's original `@import("token.zig")` cannot cross files here).

const std = @import("std");

// ---------------------------------------------------------------------------
// token.zig
// ---------------------------------------------------------------------------

/// One scanned token: where it was found and the raw bytes it covers.
/// `raw` borrows from the scanned source; it is only valid while the
/// source slice is alive.
/// NOTE(review): the patch hunk started below the Token field list, so the
/// fields are reconstructed from the literal
/// `Token{ .col = ..., .lexeme = ..., .line = ..., .raw = ... }` used in
/// scanSlice — confirm against the real token.zig.
pub const Token = struct {
    line: u32,
    col: u32,
    lexeme: Lexeme,
    raw: []const u8,
};

pub const Lexeme = enum {
    NEWLINE, // \n
    WHITESPACE, // any whitespace (currently only ' ' is routed here)
    LT, // <
    GT_CLOSING, // />
    GT, // >
    EQUALS, // =
    SINGLE_QUOTE, // '
    DOUBLE_QOUTE, // " (sic — original spelling kept; renaming would break callers)
    TEXT, // any run of plain text
    DOUBLE_OPEN_BRACE, // {{
    DOUBLE_CLOSE_BRACE, // }}
    BACKSLASH, // /

    /// Greedily consumes consecutive characters belonging to `lexeme`,
    /// starting at `pos`. The character at `pos` itself is assumed to
    /// already belong to the lexeme, so the result is always >= 1.
    /// Only .WHITESPACE and .TEXT are greedy; passing any other tag is a
    /// caller bug (`unreachable`).
    pub fn greedyCapture(lexeme: Lexeme, source: []const u8, pos: u32) u32 {
        var len: u32 = 1;
        switch (lexeme) {
            .WHITESPACE => {
                while (pos + len < source.len and source[pos + len] == ' ') {
                    len += 1;
                }
            },
            .TEXT => {
                while (pos + len < source.len) {
                    switch (source[pos + len]) {
                        // Single-character tokens always terminate a TEXT run.
                        '<', '>', '\n', '=', '\'', '"', ' ' => break,
                        // Two-character tokens terminate it only when complete.
                        '{' => if (match(source, "{{", pos + len)) break,
                        '}' => if (match(source, "}}", pos + len)) break,
                        '/' => if (match(source, "/>", pos + len)) break,
                        else => {},
                    }
                    len += 1;
                }
            },
            else => unreachable, // only WHITESPACE and TEXT are greedy captures
        }
        return len;
    }

    /// True when `source` contains `key` exactly at byte offset `pos`.
    /// Fixed: the original only checked `source.len < key.len`, so the
    /// slice `source[pos .. pos + key.len]` went out of bounds whenever
    /// `pos` was within `key.len` bytes of the end of `source`.
    fn match(source: []const u8, key: []const u8, pos: u32) bool {
        if (pos + key.len > source.len) return false;
        return std.mem.eql(u8, source[pos .. pos + key.len], key);
    }
};

/// Known HTML tag names.
/// NOTE(review): the patch elided the members between `abbr` and `video`
/// as unchanged diff context (`@@ -67,16 +147,5 @@`) — restore the full
/// list from the real token.zig before using this reconstruction.
pub const TagTypes = enum {
    a,
    abbr,
    // … members elided in the patch context …
    video,
};

/// Known HTML attribute names.
/// Fixed: `name` appeared twice in the original declaration, which is a
/// compile error for a Zig enum.
pub const TagAttributes = enum { href, alt, border, name, src, style, class, maxlength, onblur };

// ---------------------------------------------------------------------------
// scanner.zig
// ---------------------------------------------------------------------------

pub const Scanner = struct {
    /// True once a scan has completed (the original set this in `init`
    /// only and never updated it; scanSlice now flips it on completion).
    scanned: bool,
    /// 1-based line number of the token currently being emitted.
    line: u32,
    /// 1-based column of the token currently being emitted.
    col: u32,
    /// Byte length of the token currently being scanned.
    current: u32,
    /// Byte offset of the scan cursor into the source.
    pos: u32,

    pub fn init() Scanner {
        return .{ .scanned = false, .line = 1, .col = 1, .current = 0, .pos = 0 };
    }

    /// Tokenizes `source`, appending one Token per lexeme to `token_array`.
    /// Tokens borrow their `raw` slices from `source`; the caller must keep
    /// `source` alive for as long as the tokens are used.
    fn scanSlice(self: *Scanner, source: []const u8, token_array: *std.ArrayListAligned(Token, null)) !void {
        while (self.pos < source.len) {
            self.current = 1;
            const char = source[self.pos];
            const lexeme: Lexeme = switch (char) {
                '<' => .LT,
                '>' => .GT,
                '\n' => .NEWLINE,
                '=' => .EQUALS,
                '\'' => .SINGLE_QUOTE,
                '"' => .DOUBLE_QOUTE,
                // TODO: '\t' and '\r' currently fall through to the TEXT branch.
                ' ' => lex: {
                    self.current = Lexeme.greedyCapture(.WHITESPACE, source, self.pos);
                    break :lex .WHITESPACE;
                },
                else => lex: {
                    if (match(source, "{{", self.pos)) {
                        self.current = 2;
                        break :lex .DOUBLE_OPEN_BRACE;
                    } else if (match(source, "}}", self.pos)) {
                        self.current = 2;
                        break :lex .DOUBLE_CLOSE_BRACE;
                    } else if (match(source, "/>", self.pos)) {
                        self.current = 2;
                        break :lex .GT_CLOSING;
                    } else {
                        self.current = captureText(source, self.pos);
                        break :lex .TEXT;
                    }
                },
            };

            const raw = source[self.pos .. self.pos + self.current];
            self.pos += self.current;

            try token_array.append(Token{ .col = self.col, .lexeme = lexeme, .line = self.line, .raw = raw });
            self.col += self.current;

            if (lexeme == .NEWLINE) {
                self.line += 1;
                self.col = 1;
            }
        }
        self.scanned = true;
    }

    /// True when `source` contains `key` exactly at byte offset `pos`.
    /// Fixed: same out-of-bounds guard as Lexeme.match — the original
    /// sliced past the end of `source` when `pos + key.len > source.len`.
    fn match(source: []const u8, key: []const u8, pos: u32) bool {
        if (pos + key.len > source.len) return false;
        return std.mem.eql(u8, source[pos .. pos + key.len], key);
    }

    /// Byte length of the TEXT run starting at `pos`.
    /// Resolves the original "TODO use greedyCapture instead": the body
    /// was a verbatim copy of Lexeme.greedyCapture's .TEXT arm, so it now
    /// delegates instead of duplicating the loop.
    fn captureText(source: []const u8, pos: u32) u32 {
        return Lexeme.greedyCapture(.TEXT, source, pos);
    }

    /// Greedy capture for an arbitrary lexeme; see Lexeme.greedyCapture.
    /// Fixed: the original stub discarded its arguments and fell off the
    /// end of a function declared to return u32, which does not compile.
    fn capture(source: []const u8, lexeme: Lexeme, pos: u32) u32 {
        return Lexeme.greedyCapture(lexeme, source, pos);
    }

    /// Convenience wrapper that owns the list allocation: scans `source`
    /// into a fresh list and returns it. Caller owns the returned list
    /// and must call `deinit` on it.
    /// Fixed: the original stub discarded its arguments and had no return
    /// statement, which does not compile.
    fn scanSliceAlloc(self: *Scanner, source: []const u8, allocator: std.mem.Allocator) !std.ArrayListAligned(Token, null) {
        var token_array = std.ArrayListAligned(Token, null).init(allocator);
        errdefer token_array.deinit();
        try self.scanSlice(source, &token_array);
        return token_array;
    }
};

test "Scanner Test" {
    // NOTE(review): the multiline HTML fixture was garbled by the paste;
    // this is a faithful-in-spirit reconstruction of the visible fragments
    // (<html>, a div with text, a {{zig_value}} template div).
    const html =
        \\<html>
        \\<div>
        \\Another element</div>
        \\</html>
        \\<div>
        \\{{zig_value}}</div>
    ;

    var scanner = Scanner.init();
    var arr = std.ArrayListAligned(Token, null).init(std.testing.allocator);
    defer arr.deinit();
    scanner.scanSlice(html, &arr) catch |err| {
        std.debug.print("Got error: {any}", .{err});
        return error.Unexpected;
    };
    for (arr.items) |item| {
        std.debug.print("{d}:{d}\t{any}\n\t\t{s}\n", .{ item.line, item.col, item.lexeme, item.raw });
    }
}