Semi-working scanner for HTML code
This commit is contained in:
parent
492f890283
commit
01ac6a36b6
|
@ -1,21 +1,149 @@
|
|||
const std = @import("std");
|
||||
|
||||
const Token = @import("token.zig").Token;
|
||||
const Lexeme = @import("token.zig").Lexeme;
|
||||
|
||||
pub fn Scanner(comptime Source: type) type {
|
||||
return struct {
|
||||
const Self = @This();
|
||||
const SourceType = Source;
|
||||
pub const Scanner = struct {
|
||||
scanned: bool,
|
||||
line: u32,
|
||||
col: u32,
|
||||
current: u32,
|
||||
pos: u32,
|
||||
|
||||
// fn scanArrayList(self: Self, source: SourceType, token_array: std.ArrayListAligned(Token, null)) !std.ArrayListAligned(Token, null) {
|
||||
// _ = self;
|
||||
// var eof = false;
|
||||
// while (!eof) {
|
||||
// // switch(
|
||||
// }
|
||||
// }
|
||||
};
|
||||
/// Builds a scanner in its starting state: the `scanned` flag cleared,
/// line and column both at 1, and `current` and `pos` both at 0.
pub fn init() Scanner {
    const fresh = Scanner{
        .pos = 0,
        .current = 0,
        .col = 1,
        .line = 1,
        .scanned = false,
    };
    return fresh;
}
|
||||
|
||||
/// Tokenizes `source` and appends one `Token` per lexeme to `token_array`.
///
/// Scanning starts at `self.pos` and continues to the end of `source`.
/// `self.line` and `self.col` are advanced as tokens are emitted, and both are
/// recorded on each token as the position where that token begins.
///
/// Each token's `raw` field is a sub-slice of `source`, not a copy, so
/// `source` must stay alive for as long as the tokens are used.
///
/// Errors: whatever `token_array.append` returns (allocation failure).
fn scanSlice(self: *Scanner, source: []const u8, token_array: *std.ArrayListAligned(Token, null)) !void {
    while (self.pos < source.len) {
        // Everything not yet consumed. Multi-byte markers are probed with
        // `startsWith` on this slice so a marker's first byte at the very end
        // of the input can never cause an out-of-bounds slice.
        const rest = source[self.pos..];

        // Width of the lexeme in bytes; single-byte lexemes keep this default.
        self.current = 1;

        const lexeme: Lexeme = switch (rest[0]) {
            '<' => .LT,
            '>' => .GT,
            '\n' => .NEWLINE,
            '=' => .EQUALS,
            '\'' => .SINGLE_QUOTE,
            '"' => .DOUBLE_QOUTE,
            ' ' => lex: {
                self.current = Lexeme.greedyCapture(.WHITESPACE, source, self.pos);
                break :lex .WHITESPACE;
            },
            else => lex: {
                if (std.mem.startsWith(u8, rest, "{{")) {
                    self.current = 2;
                    break :lex .DOUBLE_OPEN_BRACE;
                } else if (std.mem.startsWith(u8, rest, "}}")) {
                    self.current = 2;
                    break :lex .DOUBLE_CLOSE_BRACE;
                } else if (std.mem.startsWith(u8, rest, "/>")) {
                    self.current = 2;
                    break :lex .GT_CLOSING;
                } else {
                    std.debug.print("Capturing text at {d}...\n", .{self.pos});
                    // TODO use greedyCapture instead
                    self.current = captureText(source, self.pos);
                    std.debug.print("Captured {d} characters\n", .{self.current});
                    break :lex .TEXT;
                }
            },
        };

        const raw = source[self.pos .. self.pos + self.current];
        self.pos += self.current;

        std.debug.print("{s}\n", .{raw});
        try token_array.append(Token{ .col = self.col, .lexeme = lexeme, .line = self.line, .raw = raw });
        self.col += self.current;

        // A newline token ends the current line: the next token starts at
        // column 1 of the following line.
        if (lexeme == .NEWLINE) {
            self.line += 1;
            self.col = 1;
        }
    }
}
|
||||
|
||||
/// Reports whether `key` occurs in `source` starting exactly at byte offset `pos`.
///
/// Returns `false` instead of slicing out of bounds when `pos` lies past the
/// end of `source` or when fewer than `key.len` bytes remain at `pos`.
fn match(source: []const u8, key: []const u8, pos: u32) bool {
    // The bound must account for `pos`, not just the two lengths: a key that
    // fits inside `source` as a whole may still overrun it from `pos`.
    if (pos > source.len) {
        return false;
    }
    return std.mem.startsWith(u8, source[pos..], key);
}
|
||||
|
||||
/// Measures the run of plain text that begins at `pos` in `source`.
///
/// The byte at `pos` is always counted. The run then extends until the end of
/// `source` or until the next byte starts a delimiter: one of
/// `<`, `>`, newline, `=`, `'`, `"`, space, or one of the two-byte markers
/// `{{`, `}}`, `/>`. A lone `{`, `}` or `/` is treated as ordinary text.
///
/// Returns the length of the run in bytes (always at least 1), not an end
/// position.
fn captureText(source: []const u8, pos: u32) u32 {
    var current: u32 = 1;
    while (pos + current < source.len) : (current += 1) {
        // Probe two-byte markers against the remaining input so that a marker's
        // first byte at the very end of `source` is never sliced out of bounds.
        const rest = source[pos + current ..];
        switch (rest[0]) {
            '<', '>', '\n', '=', '\'', '"', ' ' => break,
            '{' => if (std.mem.startsWith(u8, rest, "{{")) break,
            '}' => if (std.mem.startsWith(u8, rest, "}}")) break,
            '/' => if (std.mem.startsWith(u8, rest, "/>")) break,
            else => {},
        }
    }
    return current;
}
|
||||
|
||||
/// Unimplemented placeholder: every parameter is discarded and nothing is computed.
///
/// NOTE(review): the signature promises a `u32` but the body has no `return`,
/// so this presumably fails to compile as soon as something calls it.
/// Nothing in the visible code does. Presumably intended as a generic
/// per-lexeme capture routine; confirm the intent before implementing.
fn capture(source: []const u8, lexeme: Lexeme, pos: u32) u32 {
|
||||
// Explicit discards keep the unused-parameter check quiet until a real body exists.
_ = pos;
|
||||
_ = lexeme;
|
||||
_ = source;
|
||||
}
|
||||
|
||||
/// Tokenizes `source` into a freshly allocated token list.
///
/// Convenience wrapper around `scanSlice`: creates the list with `allocator`,
/// scans into it, and hands it back.
///
/// Ownership: the caller owns the returned list and must call `deinit` on it.
/// The tokens reference `source` through their `raw` slices, so `source` must
/// outlive the list's use.
///
/// Errors: any error from `scanSlice` is propagated; the partially filled
/// list is released before returning in that case.
fn scanSliceAlloc(self: *Scanner, source: []const u8, allocator: std.mem.Allocator) !std.ArrayListAligned(Token, null) {
    var tokens = std.ArrayListAligned(Token, null).init(allocator);
    // Only free on the error path; on success ownership moves to the caller.
    errdefer tokens.deinit();

    try self.scanSlice(source, &tokens);
    return tokens;
}
|
||||
};
|
||||
|
||||
// Smoke test: scans a small HTML snippet with a template placeholder and dumps
// every produced token to stderr. It only fails if scanning returns an error.
test "Scanner Test" {
    const TokenList = std.ArrayListAligned(Token, null);

    const markup =
        \\ <a href='https://google.com'/>
        \\ <p>Another element</p>
        \\ <h1>{{zig_value}}</h1>
    ;

    var html_scanner = Scanner.init();

    // The testing allocator reports the list as leaked unless it is released.
    var tokens = TokenList.init(std.testing.allocator);
    defer tokens.deinit();

    html_scanner.scanSlice(markup, &tokens) catch |scan_err| {
        std.debug.print("Got error: {any}", .{scan_err});
        return error.Unexpected;
    };

    for (tokens.items) |token| {
        std.debug.print("{d}:{d}\t{any}\n\t\t{s}\n", .{ token.line, token.col, token.lexeme, token.raw });
    }
}
|
||||
|
|
|
@ -8,19 +8,99 @@ pub const Token = struct {
|
|||
};
|
||||
|
||||
pub const Lexeme = enum {
|
||||
NEWLINE,
|
||||
LT,
|
||||
GT,
|
||||
CLOSING_LT,
|
||||
EQUALS,
|
||||
TAG,
|
||||
SINGLE_QUOTE,
|
||||
DOUBLE_QOUTE,
|
||||
TEXT,
|
||||
DOUBLE_OPEN_BRACE,
|
||||
DOUBLE_CLOSE_BRACE,
|
||||
NEWLINE, // \n
|
||||
WHITESPACE, // any whitespace
|
||||
LT, // <
|
||||
GT_CLOSING, // />
|
||||
GT, // >
|
||||
EQUALS, // =
|
||||
SINGLE_QUOTE, // '
|
||||
DOUBLE_QOUTE, // "
|
||||
TEXT, // any text
|
||||
DOUBLE_OPEN_BRACE, // {{
|
||||
DOUBLE_CLOSE_BRACE, // }}
|
||||
BACKSLASH, // /
|
||||
|
||||
/// Measures how many bytes, starting at `pos` in `source`, belong to one run
/// of `lexeme`.
///
/// The byte at `pos` is always counted, so the result is at least 1. The
/// return value is a length in bytes, not an end position.
///
/// - `.WHITESPACE`: the run continues while the next byte is a space (`' '`).
/// - `.TEXT`: the run continues until the end of `source` or until the next
///   byte starts a delimiter: one of `<`, `>`, newline, `=`, `'`, `"`, space,
///   or one of the two-byte markers `{{`, `}}`, `/>`. A lone `{`, `}` or `/`
///   counts as text.
///
/// Any other lexeme is a caller error and hits `unreachable`.
pub fn greedyCapture(lexeme: Lexeme, source: []const u8, pos: u32) u32 {
    var current: u32 = 1;
    switch (lexeme) {
        .WHITESPACE => {
            while (pos + current < source.len and source[pos + current] == ' ') {
                current += 1;
            }
        },
        .TEXT => {
            while (pos + current < source.len) : (current += 1) {
                // Probe two-byte markers against the remaining input so that a
                // marker's first byte at the very end of `source` is never
                // sliced out of bounds.
                const rest = source[pos + current ..];
                switch (rest[0]) {
                    '<', '>', '\n', '=', '\'', '"', ' ' => break,
                    '{' => if (std.mem.startsWith(u8, rest, "{{")) break,
                    '}' => if (std.mem.startsWith(u8, rest, "}}")) break,
                    '/' => if (std.mem.startsWith(u8, rest, "/>")) break,
                    else => {},
                }
            }
        },
        // Only variable-width lexemes can be captured greedily.
        else => unreachable,
    }
    return current;
}
|
||||
|
||||
/// Reports whether `key` occurs in `source` starting exactly at byte offset `pos`.
///
/// Returns `false` instead of slicing out of bounds when `pos` lies past the
/// end of `source` or when fewer than `key.len` bytes remain at `pos`.
fn match(source: []const u8, key: []const u8, pos: u32) bool {
    // The bound must account for `pos`, not just the two lengths: a key that
    // fits inside `source` as a whole may still overrun it from `pos`.
    if (pos > source.len) {
        return false;
    }
    return std.mem.startsWith(u8, source[pos..], key);
}
|
||||
};
|
||||
|
||||
// const LexemeChar = union(Lexeme) {
|
||||
// NEWLINE: u8, // \n
|
||||
// LT: u8, // <
|
||||
// GT_CLOSING: u8, // />
|
||||
// GT: u8, // >
|
||||
// EQUALS: u8, // =
|
||||
// SINGLE_QUOTE: u8, // '
|
||||
// DOUBLE_QOUTE: u8, // "
|
||||
// TEXT: u8, // any text
|
||||
// DOUBLE_OPEN_BRACE: u8, // {{
|
||||
// DOUBLE_CLOSE_BRACE: u8, // }}
|
||||
// BACKSLASH: u8, // /
|
||||
// };
|
||||
|
||||
pub const TagTypes = enum {
|
||||
a,
|
||||
abbr,
|
||||
|
@ -67,16 +147,5 @@ pub const TagTypes = enum {
|
|||
video,
|
||||
};
|
||||
|
||||
pub const TagAttributes = enum {
|
||||
href,
|
||||
alt,
|
||||
border,
|
||||
name,
|
||||
src,
|
||||
style,
|
||||
class,
|
||||
maxlength,
|
||||
name,
|
||||
onblur
|
||||
};
|
||||
/// HTML attribute names known to the scanner.
///
/// Each name appears once: an enum cannot declare the same field twice, and
/// `name` was previously listed a second time.
pub const TagAttributes = enum {
    href,
    alt,
    border,
    name,
    src,
    style,
    class,
    maxlength,
    onblur,
};
|
||||
// <h3 class="gogo">text</h3>
|
||||
|
|
Loading…
Reference in New Issue
Block a user