Semi-working scanner for HTML code

This commit is contained in:
Nathan Anderson 2023-11-15 22:53:31 -07:00
parent 492f890283
commit 01ac6a36b6
2 changed files with 235 additions and 38 deletions

View File

@ -1,21 +1,149 @@
const std = @import("std");
const Token = @import("token.zig").Token;
const Lexeme = @import("token.zig").Lexeme;
pub const Scanner = struct {
scanned: bool,
line: u32,
col: u32,
current: u32,
pos: u32,
pub fn init() Scanner {
return .{ .scanned = false, .line = 1, .col = 1, .current = 0, .pos = 0 };
}
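/// Tokenizes `source`, appending one Token per lexeme to `token_array` and tracking line/column positions.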
fn scanSlice(self: *Scanner, source: []const u8, token_array: *std.ArrayListAligned(Token, null)) !void {
var eof = false;
blk: while (true) {
eof = self.pos >= source.len;
if (eof) {
break :blk;
}
self.current = 1;
var char = source[self.pos];
var lexeme: Lexeme =
switch (char) {
'<' => .LT,
'>' => .GT,
'\n' => .NEWLINE,
'=' => .EQUALS,
'\'' => .SINGLE_QUOTE,
'"' => .DOUBLE_QOUTE,
' ' => lex: {
self.current = Lexeme.greedyCapture(.WHITESPACE, source, self.pos);
break :lex .WHITESPACE;
},
else => lex: {
if (match(source, "{{", self.pos)) {
self.current = 2;
break :lex .DOUBLE_OPEN_BRACE;
} else if (match(source, "}}", self.pos)) {
self.current = 2;
break :lex .DOUBLE_CLOSE_BRACE;
} else if (match(source, "/>", self.pos)) {
self.current = 2;
break :lex .GT_CLOSING;
} else {
std.debug.print("Capturing text at {d}...\n", .{self.pos});
// TODO use greedyCapture instead
self.current = captureText(source, self.pos);
std.debug.print("Captured {d} characters\n", .{self.current});
break :lex .TEXT;
}
},
};
var raw = source[self.pos .. self.pos + self.current];
self.pos += self.current;
std.debug.print("{s}\n", .{raw});
try token_array.append(Token{ .col = self.col, .lexeme = lexeme, .line = self.line, .raw = raw });
self.col += self.current;
if (lexeme == .NEWLINE) {
self.line += 1;
self.col = 1;
}
}
}
/// Returns true if `key` appears verbatim in `source` starting at offset `pos`.
fn match(source: []const u8, key: []const u8, pos: u32) bool {
// Make sure `key` fits within `source` starting at `pos` before slicing.
if (pos + key.len > source.len) {
return false;
}
var source_slice = source[pos .. pos + key.len];
return std.mem.eql(u8, source_slice, key);
}
/// Consumes a run of plain-text characters starting at `pos`, stopping at any
/// character that begins another lexeme.
/// Returns the number of characters captured (always at least 1).
fn captureText(source: []const u8, pos: u32) u32 {
var current: u32 = 1;
var eof = false;
blk: while (true) {
eof = pos + current >= source.len;
if (eof) {
break :blk;
}
var char = source[pos + current];
switch (char) {
'<', '>', '\n', '=', '\'', '"', ' ' => break :blk,
'{' => {
if (match(source, "{{", pos + current)) {
break :blk;
}
},
'}' => {
if (match(source, "}}", pos + current)) {
break :blk;
}
},
'/' => {
if (match(source, "/>", pos + current)) {
break :blk;
}
},
else => {},
}
current += 1;
}
return current;
}
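/// Unfinished stub; parameters are discarded to silence unused-parameter errors until it is implemented.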
fn capture(source: []const u8, lexeme: Lexeme, pos: u32) u32 {
_ = pos;
_ = lexeme;
_ = source;
}
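/// Unfinished stub for an allocating variant of scanSlice (returns its own token list instead of filling a caller-provided one).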
fn scanSliceAlloc(self: *Scanner, source: []const u8, allocator: std.mem.Allocator) !std.ArrayListAligned(Token, null) {
_ = allocator;
_ = source;
_ = self;
}
};
test "Scanner Test" {
const html =
\\ <a href='https://google.com'/>
\\ <p>Another element</p>
\\ <h1>{{zig_value}}</h1>
;
var scanner = Scanner.init();
var arr = std.ArrayListAligned(Token, null).init(std.testing.allocator);
defer arr.deinit();
scanner.scanSlice(html, &arr) catch |err| {
std.debug.print("Got error: {any}", .{err});
return error.Unexpected;
};
for (arr.items) |item| {
std.debug.print("{d}:{d}\t{any}\n\t\t{s}\n", .{ item.line, item.col, item.lexeme, item.raw });
}
}
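
The scanSliceAlloc stub above is left empty in this commit; the following is a minimal, hypothetical sketch of how it could wrap scanSlice, assuming the caller owns and deinits the returned list (not part of the committed code):

fn scanSliceAlloc(self: *Scanner, source: []const u8, allocator: std.mem.Allocator) !std.ArrayListAligned(Token, null) {
    // Build the list with the caller's allocator, delegate the actual scanning
    // to scanSlice, and hand ownership of the populated list back to the caller.
    var token_array = std.ArrayListAligned(Token, null).init(allocator);
    errdefer token_array.deinit();
    try self.scanSlice(source, &token_array);
    return token_array;
}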

View File

@ -8,19 +8,99 @@ pub const Token = struct {
};
pub const Lexeme = enum {
NEWLINE, // \n
WHITESPACE, // any whitespace
LT, // <
GT_CLOSING, // />
GT, // >
EQUALS, // =
SINGLE_QUOTE, // '
DOUBLE_QOUTE, // "
TEXT, // any text
DOUBLE_OPEN_BRACE, // {{
DOUBLE_CLOSE_BRACE, // }}
BACKSLASH, // /
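/// Greedily consumes a run of characters belonging to `lexeme`, starting at `pos`.
/// Returns the length of the run in bytes (at least 1); only WHITESPACE and TEXT are handled.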
pub fn greedyCapture(lexeme: Lexeme, source: []const u8, pos: u32) u32 {
var current: u32 = 1;
var eof = false;
switch (lexeme) {
.WHITESPACE => {
blk: while (true) {
eof = pos + current >= source.len;
if (eof) {
break :blk;
}
var char = source[pos + current];
switch (char) {
' ' => {},
else => break :blk,
}
current += 1;
}
},
.TEXT => {
blk: while (true) {
eof = pos + current >= source.len;
if (eof) {
break :blk;
}
var char = source[pos + current];
switch (char) {
'<', '>', '\n', '=', '\'', '"', ' ' => break :blk,
'{' => {
if (match(source, "{{", pos + current)) {
break :blk;
}
},
'}' => {
if (match(source, "}}", pos + current)) {
break :blk;
}
},
'/' => {
if (match(source, "/>", pos + current)) {
break :blk;
}
},
else => {},
}
current += 1;
}
},
else => unreachable,
}
return current;
}
fn match(source: []const u8, key: []const u8, pos: u32) bool {
// Make sure `key` fits within `source` starting at `pos` before slicing.
if (pos + key.len > source.len) {
return false;
}
var source_slice = source[pos .. pos + key.len];
return std.mem.eql(u8, source_slice, key);
}
};
// const LexemeChar = union(Lexeme) {
// NEWLINE: u8, // \n
// LT: u8, // <
// GT_CLOSING: u8, // />
// GT: u8, // >
// EQUALS: u8, // =
// SINGLE_QUOTE: u8, // '
// DOUBLE_QOUTE: u8, // "
// TEXT: u8, // any text
// DOUBLE_OPEN_BRACE: u8, // {{
// DOUBLE_CLOSE_BRACE: u8, // }}
// BACKSLASH: u8, // /
// };
pub const TagTypes = enum {
a,
abbr,
@ -67,16 +147,5 @@ pub const TagTypes = enum {
video,
};
pub const TagAttributes = enum { href, alt, border, name, src, style, class, maxlength, onblur };
// <h3 class="gogo">text</h3>