diff --git a/Tests/LibWeb/TestHTMLTokenizerSwift.swift b/Tests/LibWeb/TestHTMLTokenizerSwift.swift
index e7e92bae417..b6be94539de 100644
--- a/Tests/LibWeb/TestHTMLTokenizerSwift.swift
+++ b/Tests/LibWeb/TestHTMLTokenizerSwift.swift
@@ -5,8 +5,8 @@
*/
import AK
-import Web
import Testing
+import Web
@Suite
struct TestHTMLTokenizerSwift {
@@ -30,4 +30,86 @@ struct TestHTMLTokenizerSwift {
#expect(!token.isParserWhitespace())
}
}
+
+    /// A tokenizer created without input starts in the Data state, hands out a
+    /// single EndOfFile token, then returns nil while staying in the Data state.
+    @Test func dataStateNoInput() {
+        let emptyTokenizer = HTMLTokenizer()
+        #expect(emptyTokenizer.state == .Data)  // nothing has been consumed yet
+
+        let endOfFileToken = emptyTokenizer.nextToken()
+        #expect(endOfFileToken?.type == .EndOfFile)
+
+        // EndOfFile is only handed out once; afterwards there is nothing left.
+        let afterEndOfFile = emptyTokenizer.nextToken()
+        #expect(afterEndOfFile == nil)
+        #expect(emptyTokenizer.state == .Data)
+    }
+
+    /// A single ordinary character comes back as a Character token, followed by
+    /// EndOfFile and then nil; the tokenizer never leaves the Data state.
+    @Test func dataStateSingleChar() {
+        guard let singleCharTokenizer = HTMLTokenizer(input: "X") else {
+            Issue.record("Failed to create tokenizer for 'X'")
+            return
+        }
+        #expect(singleCharTokenizer.state == .Data)  // nothing has been consumed yet
+
+        let characterToken = singleCharTokenizer.nextToken()
+        #expect(characterToken?.type == .Character(codePoint: "X"))
+
+        let endOfFileToken = singleCharTokenizer.nextToken()
+        #expect(endOfFileToken?.type == .EndOfFile)
+
+        let afterEndOfFile = singleCharTokenizer.nextToken()
+        #expect(afterEndOfFile == nil)
+        #expect(singleCharTokenizer.state == .Data)
+    }
+
+    /// A lone "&" moves the tokenizer into the CharacterReference state; the
+    /// first token handed out is EndOfFile and the next call returns nil.
+    @Test func dataStateAmpersand() {
+        guard let ampersandTokenizer = HTMLTokenizer(input: "&") else {
+            Issue.record("Failed to create tokenizer for '&'")
+            return
+        }
+        #expect(ampersandTokenizer.state == .Data)  // nothing has been consumed yet
+
+        let endOfFileToken = ampersandTokenizer.nextToken()
+        #expect(endOfFileToken?.type == .EndOfFile)
+        #expect(ampersandTokenizer.state == .CharacterReference)
+
+        let afterEndOfFile = ampersandTokenizer.nextToken()
+        #expect(afterEndOfFile == nil)
+    }
+
+    /// A lone "<" moves the tokenizer into the TagOpen state; the first token
+    /// handed out is EndOfFile and the next call returns nil.
+    @Test func dataStateTagOpen() {
+        guard let tagOpenTokenizer = HTMLTokenizer(input: "<") else {
+            Issue.record("Failed to create tokenizer for '<'")
+            return
+        }
+        #expect(tagOpenTokenizer.state == .Data)  // nothing has been consumed yet
+
+        let endOfFileToken = tagOpenTokenizer.nextToken()
+        #expect(endOfFileToken?.type == .EndOfFile)
+        #expect(tagOpenTokenizer.state == .TagOpen)
+
+        let afterEndOfFile = tagOpenTokenizer.nextToken()
+        #expect(afterEndOfFile == nil)
+    }
+
+    /// A NUL character in the Data state is replaced by U+FFFD while the
+    /// surrounding characters pass through unchanged.
+    @Test func dataStateNulChar() {
+        guard let nulTokenizer = HTMLTokenizer(input: "H\0I") else {
+            Issue.record("Failed to create tokenizer for 'H\\0I'")
+            return
+        }
+        #expect(nulTokenizer.state == .Data)  // nothing has been consumed yet
+
+        // Token types in the order the tokenizer must hand them out.
+        let expectedTypes: [HTMLToken.TokenType] = [
+            .Character(codePoint: "H"),
+            .Character(codePoint: "\u{FFFD}"),
+            .Character(codePoint: "I"),
+            .EndOfFile,
+        ]
+        for expectedType in expectedTypes {
+            let producedToken = nulTokenizer.nextToken()
+            #expect(producedToken?.type == expectedType)
+        }
+
+        #expect(nulTokenizer.state == .Data)
+    }
}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
index fe0045f2a28..c5920d13cc6 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
@@ -7,13 +7,13 @@
@_exported import WebCxx
public class HTMLToken {
- public struct Position {
+ public struct Position: Equatable { // Equatable is declared on the type itself so positions can be compared with ==
var line = UInt()
var column = UInt()
var byteOffset = UInt()
}
- public struct Attribute {
+ public struct Attribute: Equatable {
var prefix: Swift.String?
var localName: Swift.String
var namespace_: Swift.String?
@@ -24,7 +24,7 @@ public class HTMLToken {
var valueEndPosition: Position
}
- public enum TokenType {
+ public enum TokenType: Equatable {
case Invalid
case DOCTYPE(
name: Swift.String?,
@@ -79,7 +79,7 @@ public class HTMLToken {
}
}
-extension HTMLToken.Position: Equatable, CustomStringConvertible {
+extension HTMLToken.Position: CustomStringConvertible {
public var description: Swift.String {
return "\(self.line):\(self.column)"
}
@@ -109,13 +109,11 @@ extension HTMLToken.TokenType: CustomStringConvertible {
extension HTMLToken: CustomStringConvertible {
public var description: Swift.String {
- if (self.startPosition == Position()) {
+ if self.startPosition == Position() { // start equals a default-constructed Position: print the type only
return "HTMLToken(type: \(self.type))"
- }
- else if (self.endPosition == Position()) {
+ } else if self.endPosition == Position() { // only the end is still default: append "@start"
return "HTMLToken(type: \(self.type))@\(self.startPosition)"
- }
- else {
+ } else { // both positions differ from the default: append "@start-end"
return "HTMLToken(type: \(self.type))@\(self.startPosition)-\(self.endPosition)"
}
}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
index 8d1aebc5c45..79bce616082 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
@@ -20,9 +20,9 @@ extension Swift.String {
}
}
-class HTMLTokenizer {
+public class HTMLTokenizer {
- enum State {
+ public enum State {
case Data
case RCDATA
case RAWTEXT
@@ -105,29 +105,224 @@ class HTMLTokenizer {
case NumericCharacterReferenceEnd
}
- var input = Swift.String()
- var state = State.Data
- var returnState = State.Data
+ private var input = Swift.String()
+ private var cursor: Swift.String.Index
+ private var previousCursor: Swift.String.Index
- var currentToken = HTMLToken()
- var queuedTokens = Deque()
+ public private(set) var state = State.Data
+ private var returnState = State.Data
- public init() {}
+ private var currentToken = HTMLToken()
+ private var queuedTokens = Deque()
+
+ private var aborted = false
+ private var hasEmittedEOF = false
+
+    /// Creates a tokenizer over the default (empty) input, with both cursors
+    /// placed at the start of that input.
+    public init() {
+        let start = self.input.startIndex
+        self.previousCursor = start
+        self.cursor = start
+    }
public init?(input: AK.StringView, encoding: AK.StringView) {
if let string = Swift.String(decoding: input, as: encoding) {
self.input = string
} else {
return nil
}
+ self.cursor = self.input.startIndex // reading starts at the first character of the decoded input
+ self.previousCursor = self.input.startIndex // nothing consumed yet, so "previous" is also the start
+ }
+
+ public convenience init?(input: AK.StringView) {
+ self.init(input: input, encoding: "UTF-8") // delegates to init(input:encoding:) with the encoding fixed to "UTF-8"; fails when that initializer fails
+ }
+
+    /// Marks the tokenizer as aborted. Once the already-queued tokens are
+    /// drained, `nextToken` returns nil instead of tokenizing further input.
+    public func abort() {
+        aborted = true
+    }
+
+    /// Advances the cursor by `count` characters, stopping at the end of the input.
+    ///
+    /// Afterwards `previousCursor` refers to the character immediately before
+    /// the new cursor, which is where `restoreCursorToPrevious()` rewinds to.
+    ///
+    /// - Parameter count: Number of characters to move forward.
+    func skip(_ count: Int) {
+        self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) ?? self.input.endIndex
+        // `index(before:)` traps at `startIndex` (empty input, or `skip(0)` before
+        // anything was consumed); in that case there is no earlier character, so
+        // both cursors stay at the start.
+        if self.cursor > self.input.startIndex {
+            self.previousCursor = self.input.index(before: self.cursor)
+        } else {
+            self.previousCursor = self.cursor
+        }
+    }
+
+    /// Returns the character `offset` positions away from the cursor without
+    /// consuming anything.
+    ///
+    /// - Parameter offset: Distance from the cursor; 0 is the character the
+    ///   cursor currently points at.
+    /// - Returns: The character at that position, or nil when the position lies
+    ///   outside the input (including empty input and a cursor at the end).
+    func peekCodePoint(_ offset: Int = 0) -> Character? {
+        // Bound the walk by the end of the input in the direction of travel. The
+        // previous bound, `index(before: endIndex)`, trapped on empty input.
+        let limit = offset >= 0 ? self.input.endIndex : self.input.startIndex
+        guard let index = self.input.index(self.cursor, offsetBy: offset, limitedBy: limit),
+              index < self.input.endIndex
+        else {
+            // `endIndex` is a valid index but not a valid subscript position.
+            return nil
+        }
+        return self.input[index]
+    }
+
+    /// Consumes and returns the next input character with newlines normalized,
+    /// or nil when the cursor is at the end of the input.
+    ///
+    /// - Returns: "\n" for CR LF and for a lone CR, otherwise the consumed
+    ///   character unchanged.
+    func nextCodePoint() -> Character? {
+        guard self.cursor < self.input.endIndex else {
+            return nil
+        }
+
+        // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
+        // https://infra.spec.whatwg.org/#normalize-newlines
+        let consumed = self.input[self.cursor]
+        skip(1)
+
+        switch consumed {
+        case "\r\n":
+            // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
+            // Swift groups CR LF into ONE Character, so the pair is matched as a
+            // single element; peeking for "\r" followed by "\n" can never succeed.
+            return "\n"
+        case "\r":
+            // replace every remaining U+000D CR code point with a U+000A LF code point.
+            return "\n"
+        default:
+            return consumed
+        }
+    }
+
+    /// Rewinds the cursor to `previousCursor`, i.e. to the position recorded by
+    /// the most recent `skip`.
+    func restoreCursorToPrevious() {
+        cursor = previousCursor
+    }
+
+    /// Makes `token` the token currently being built.
+    func createNewToken(_ token: HTMLToken) {
+        // FIXME: Assign Position
+        currentToken = token
+    }
+
+ enum NextTokenState { // outcome of one nextTokenImpl step; tells nextToken's loop what to do next
+ case Emit(token: HTMLToken?) // nextToken returns this value (possibly nil) to its caller
+ case SwitchTo // state changed; the loop continues and the next step consumes a fresh character
+ case Reconsume(inputCharacter: Character?) // the loop continues and feeds this character into the next step
+ case ReprocessQueue // the loop drains queued tokens first, then continues with a fresh character
}
public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
- while !queuedTokens.isEmpty {
- return queuedTokens.popFirst()
+ let processQueue = { () -> HTMLToken?? in // outer nil: nothing decided, keep tokenizing; otherwise the wrapped value is what nextToken returns
+ if let token = self.queuedTokens.popFirst() { // queued tokens are handed out before any new input is read
+ return token
+ }
+ return self.aborted ? Optional(nil) : nil // aborted with an empty queue: make nextToken return nil
}
- return nil
+ if let maybeToken = processQueue() {
+ return maybeToken // may itself be nil when the tokenizer was aborted
+ }
+
+ var nextInputCharacter: Character? = nil // character to reconsume in the next step; nil means consume a fresh one
+ while true {
+ // FIXME: Handle insertion point (stopAtInsertionPoint is not consulted yet)
+ switch nextTokenImpl(nextInputCharacter) {
+ case .Emit(let token):
+ return token
+ case .SwitchTo:
+ nextInputCharacter = nil
+ break // redundant: Swift cases do not fall through
+ case .Reconsume(let character):
+ nextInputCharacter = character
+ break
+ case .ReprocessQueue:
+ if let maybeToken = processQueue() {
+ return maybeToken
+ }
+ nextInputCharacter = nil
+ break
+ }
+ }
}
+    /// Moves the state machine to `state`.
+    ///
+    /// - Returns: `.SwitchTo`, so the caller continues with a freshly consumed character.
+    func switchTo(_ state: State) -> NextTokenState {
+        let outcome = NextTokenState.SwitchTo
+        self.state = state
+        return outcome
+    }
+
+    /// Moves the state machine to `state` and asks for `character` to be
+    /// processed again there.
+    ///
+    /// - Returns: `.Reconsume` carrying `character`.
+    func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
+        let outcome = NextTokenState.Reconsume(inputCharacter: character)
+        self.state = state
+        return outcome
+    }
+
+    /// Moves the state machine back to the saved return state.
+    ///
+    /// - Returns: `.ReprocessQueue`, so queued tokens are drained before more input is read.
+    func switchToReturnState() -> NextTokenState {
+        state = returnState
+        return NextTokenState.ReprocessQueue
+    }
+
+    /// Moves the state machine back to the saved return state and, when an
+    /// actual character was passed in, rewinds the cursor so it is read again.
+    ///
+    /// - Parameter character: The current input character; nil skips the rewind.
+    /// - Returns: `.ReprocessQueue`.
+    func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
+        state = returnState
+        guard character != nil else {
+            return .ReprocessQueue
+        }
+        restoreCursorToPrevious()
+        return .ReprocessQueue
+    }
+
+    /// Moves the state machine to `state`, queues the token being built, and
+    /// starts a fresh one.
+    ///
+    /// - Returns: `.Emit` with the token at the front of the queue.
+    func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
+        self.state = state
+        queuedTokens.append(currentToken)
+        currentToken = HTMLToken()
+        // The queue holds at least the token appended above, so the unwrap cannot fail.
+        let frontToken = queuedTokens.popFirst()!
+        return .Emit(token: frontToken)
+    }
+
+    /// Moves the state machine to `state`, then emits `character` as a
+    /// Character token via `emitCharacter`.
+    func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
+        self.state = state
+        let outcome = emitCharacter(character)
+        return outcome
+    }
+
+    /// Queues a Character token for `character`, moves the state machine to the
+    /// given state, and asks for `currentInputCharacter` to be processed again.
+    ///
+    /// - Returns: `.Reconsume` carrying `currentInputCharacter`; the queued token
+    ///   is not returned by this step.
+    func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
+        let characterToken = HTMLToken(type: .Character(codePoint: character))
+        queuedTokens.append(characterToken)
+        state = `in`
+        return .Reconsume(inputCharacter: currentInputCharacter)
+    }
+
+    /// Emits the EndOfFile token the first time it is called; every later call
+    /// emits nil.
+    ///
+    /// - Returns: `.Emit` with the token at the front of the queue, or
+    ///   `.Emit(token: nil)` once EndOfFile has already been emitted.
+    func emitEOF() -> NextTokenState {
+        guard !hasEmittedEOF else {
+            return .Emit(token: nil)
+        }
+        hasEmittedEOF = true
+        createNewToken(HTMLToken(type: .EndOfFile))
+        queuedTokens.append(currentToken)
+        currentToken = HTMLToken()
+        // The queue holds at least the token appended above, so the unwrap cannot fail.
+        let frontToken = queuedTokens.popFirst()!
+        return .Emit(token: frontToken)
+    }
+
+    /// Queues the token being built, starts a fresh one, and then emits via
+    /// `emitEOF`, which queues EndOfFile behind it.
+    ///
+    /// Must not be called after EndOfFile has been emitted.
+    func emitCurrentTokenFollowedByEOF() -> NextTokenState {
+        precondition(!hasEmittedEOF)
+        queuedTokens.append(currentToken)
+        currentToken = HTMLToken()
+        let outcome = emitEOF()
+        return outcome
+    }
+
+    /// Builds a Character token for `character`, queues it, and resets the
+    /// token being built.
+    ///
+    /// - Returns: `.Emit` with the token at the front of the queue.
+    func emitCharacter(_ character: Character) -> NextTokenState {
+        let characterToken = HTMLToken(type: .Character(codePoint: character))
+        createNewToken(characterToken)
+        queuedTokens.append(currentToken)
+        currentToken = HTMLToken()
+        // The queue holds at least the token appended above, so the unwrap cannot fail.
+        let frontToken = queuedTokens.popFirst()!
+        return .Emit(token: frontToken)
+    }
+
+    /// Runs one step of the tokenizer state machine.
+    ///
+    /// - Parameter nextInputCharacter: A character to reconsume; when nil, a
+    ///   fresh character is consumed from the input.
+    /// - Returns: What `nextToken` should do next (emit, keep looping, ...).
+    func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
+        // Not used by any implemented state yet; the discard below keeps it referenced.
+        let dontConsumeNextInputCharacter = {
+            self.restoreCursorToPrevious()
+        }
+        let _ = dontConsumeNextInputCharacter
+
+        // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder
+
+        // Handle reconsume by passing the character around in the state enum
+        let currentInputCharacter = nextInputCharacter ?? nextCodePoint()
+
+        switch state {
+        // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
+        case .Data:
+            switch currentInputCharacter {
+            case "&":
+                returnState = .Data
+                return switchTo(.CharacterReference)
+            case "<":
+                return switchTo(.TagOpen)
+            case "\0":
+                // FIXME: log_parse_error()
+                return emitCharacter("\u{FFFD}")
+            case .none:
+                // End of input.
+                return emitEOF()
+            case .some(let ordinaryCharacter):
+                return emitCharacter(ordinaryCharacter)
+            }
+        default:
+            // States that are not implemented yet report themselves and end the stream.
+            print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
+            return emitEOF()
+        }
+    }
}