A naïve attempt at parsing CSS in JavaScript – Part 10: I think it works
This latest version of the CSS tokenizer appears to work. It turns this input:
/* c */
#w{width:calc(+100%);content:'kfdl';background-image:url(http://example.com/a.png)}
Into this output:
[
"t": "comment-token", "v": "/* c */",
"t": "whitespace-token", "v": "\n",
"t": "hash-token-id", "v": "#w",
"t": "left-curly-token", "v": "{",
"t": "ident-token", "v": "width",
"t": "colon-token", "v": ":",
"t": "function-token", "v": "calc(",
"t": "percentage-token", "v": "+100%",
"t": "right-parentheses-token", "v": ")",
"t": "semi-colon-token", "v": ";",
"t": "ident-token", "v": "content",
"t": "colon-token", "v": ":",
"t": "string-token", "v": "'kfdl'",
"t": "semi-colon-token", "v": ";",
"t": "ident-token", "v": "background-image",
"t": "colon-token", "v": ":",
"t": "url-token", "v": "url(http://example.com/a.png)",
"t": "right-curly-token", "v": "}"
]
It appears to handle comments, whitespace, hash tokens, ident tokens, function tokens, various symbol tokens, URLs, and numeric tokens more or less as I would expect.
The full code is here:
// Matches any string containing an ASCII letter (a-z or A-Z). The pattern is
// not anchored; callers in this file pass one character at a time.
const letterPattern = /[a-zA-Z]/;
// Matches any string containing an ASCII decimal digit (0-9). Also unanchored
// and used on single characters.
const digitPattern = /[0-9]/;
/**
 * Checks whether a UTF-16 code unit is an ASCII digit, i.e. lies between
 * U+0030 DIGIT ZERO (48) and U+0039 DIGIT NINE (57) inclusive.
 * https://www.w3.org/TR/css-syntax-3/#digit
 * @param {number} charCode
 * @returns {boolean} false for NaN, because every comparison with NaN fails
 */
const isDigitCharCode = charCode => {
  // Both bounds must hold; with `||` every number would count as a digit.
  return charCode >= 48 && charCode <= 57;
};
/**
 * Checks whether the first character of a string is an ASCII digit.
 * @param {string} char
 * @returns {boolean} false for the empty string
 */
const isDigit = char => {
  // charCodeAt(0) is NaN for the empty string, which isDigitCharCode rejects.
  // Pass the numeric code, not the string: a string argument would be coerced
  // (for example ' ' becomes 0) and give wrong answers.
  return isDigitCharCode(char.charCodeAt(0));
};
/**
 * Checks whether the first code unit of a string is outside the ASCII range.
 * https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point
 *
 * The spec defines a non-ASCII code point as U+0080 or greater, so the bound
 * is inclusive.
 * @param {string} char
 * @returns {boolean} false for the empty string (charCodeAt yields NaN)
 */
const isNonAscii = char => char.charCodeAt(0) >= 0x80;
/**
 * Checks whether a character is CSS whitespace.
 * https://www.w3.org/TR/css-syntax-3/#whitespace
 *
 * The spec defines whitespace as newline, tab or space, where "newline" is
 * U+000A after input preprocessing has folded CR, FF and CRLF into LF. No
 * preprocessing step is visible in this file, so CR and FF are accepted here
 * directly and kept verbatim in the token text.
 * @param {string} char
 * @returns {boolean}
 */
const isWhitespace = char =>
  char === ' ' ||
  char === '\n' ||
  char === '\t' ||
  char === '\r' ||
  char === '\f';
/**
 * Checks whether a character is one of the two CSS string delimiters:
 * U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE (').
 * @param {string} char
 * @returns {boolean}
 */
const isQuotationMark = char => ['"', "'"].includes(char);
/**
 * Checks whether the first character of a string is a hexadecimal digit:
 * whatever isDigitCharCode accepts, or a letter A-F / a-f.
 * https://www.w3.org/TR/css-syntax-3/#hex-digit
 * @param {string} char
 * @returns {boolean} false for the empty string
 */
const isHexDigit = char => {
  const code = char.charCodeAt(0);
  // charCodeAt(0) is NaN when the string is empty.
  if (isNaN(code)) {
    return false;
  }
  if (isDigitCharCode(code)) {
    return true;
  }
  const isUpperAToF = code >= 65 && code <= 70;
  const isLowerAToF = code >= 97 && code <= 102;
  return isUpperAToF || isLowerAToF;
};
/**
 * Checks whether the first character of a string is a non-printable code
 * point. The empty string is also reported as non-printable.
 * https://www.w3.org/TR/css-syntax-3/#non-printable-code-point
 * @param {string} char
 * @returns {boolean}
 */
const isNonPrintable = char => {
  const code = char.charCodeAt(0);
  // Empty string: charCodeAt(0) is NaN.
  if (isNaN(code)) {
    return true;
  }
  // U+0000 NULL through U+0008 BACKSPACE.
  if (code <= 8) {
    return true;
  }
  // U+000B LINE TABULATION.
  if (code === 11) {
    return true;
  }
  // U+000E SHIFT OUT through U+001F INFORMATION SEPARATOR ONE.
  if (code >= 14 && code <= 31) {
    return true;
  }
  // U+007F DELETE.
  return code === 127;
};
// https://www.w3.org/TR/css-syntax-3/#name-start-code-point
// A name-start code point is an underscore, a non-ASCII code point or an
// ASCII letter.
const isNameStartCodePoint = char => {
  if (char === '_') {
    return true;
  }
  if (isNonAscii(char)) {
    return true;
  }
  return letterPattern.test(char);
};
// https://www.w3.org/TR/css-syntax-3/#name-code-point
// A name code point is a hyphen-minus, a digit or a name-start code point.
const isNameCodePoint = char => {
  if (char === '-') {
    return true;
  }
  if (digitPattern.test(char)) {
    return true;
  }
  return isNameStartCodePoint(char);
};
// https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
// Two code points form a valid escape when the first is a backslash and the
// second is anything other than a newline (an empty second value counts as
// valid here).
const isValidEscape = (next, nPlus1) => next === '\\' && nPlus1 !== '\n';
// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
// Decides from three code points of lookahead whether an identifier begins
// at `n`.
const startsIdentifier = (n, nPlus1, nPlus2) => {
  if (n !== '-') {
    // A name-start code point, or a backslash that opens a valid escape.
    return isNameStartCodePoint(n) || (n === '\\' && isValidEscape(n, nPlus1));
  }
  // A leading hyphen must be followed by a name-start code point, a second
  // hyphen, or a valid escape.
  return (
    isNameStartCodePoint(nPlus1) ||
    nPlus1 === '-' ||
    isValidEscape(nPlus1, nPlus2)
  );
};
// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
// Decides from three code points of lookahead whether a number begins at `n`.
const startsNumber = (n, nPlus1, nPlus2) => {
  const looksLikeDigit = value => digitPattern.test(value);
  switch (n) {
    case '-':
    case '+':
      // A sign must be followed by a digit, or by a full stop and a digit.
      return (
        looksLikeDigit(nPlus1) || (nPlus1 === '.' && looksLikeDigit(nPlus2))
      );
    case '.':
      return looksLikeDigit(nPlus1);
    default:
      return looksLikeDigit(n);
  }
};
/**
 * A minimal stand-in for an input stream over a CSS string. The unread part
 * of the input is kept in `css`; consuming removes characters from its front.
 */
class CssFakeStream {
  /**
   * The not-yet-consumed remainder of the input.
   * @type {string}
   */
  css = '';

  /**
   * @param {string} css text to read from
   */
  constructor(css = '') {
    this.css = css;
  }

  /**
   * Reads the character `offset` positions ahead without consuming anything.
   * @param {number} offset
   * @returns {string} the character, or '' past the end of the input
   */
  #peek(offset) {
    return this.css.charAt(offset);
  }

  /**
   * The next character to be consumed, or '' at the end of the input.
   * @returns {string}
   */
  get next() {
    return this.#peek(0);
  }

  /**
   * The character one position after `next`, or ''.
   * @returns {string}
   */
  get nextPlus1() {
    return this.#peek(1);
  }

  /**
   * The character two positions after `next`, or ''.
   * @returns {string}
   */
  get nextPlus2() {
    return this.#peek(2);
  }

  /**
   * The character three positions after `next`, or ''.
   * @returns {string}
   */
  get nextPlus3() {
    return this.#peek(3);
  }

  /**
   * Reports whether at least one character is left to consume.
   * @returns {boolean}
   */
  hasNext() {
    // charCodeAt(0) is NaN exactly when there is no character at index 0.
    return !Number.isNaN(this.css.charCodeAt(0));
  }

  /**
   * Removes a single character from the start of the CSS stream and returns
   * it.
   * @returns {string} the removed character, or '' when the stream is empty
   */
  consume() {
    const head = this.#peek(0);
    this.css = this.css.slice(1);
    return head;
  }

  /**
   * Reconsumes a character (prepends it to the stream).
   * @param {string} char
   */
  reconsume(char) {
    this.css = `${char}${this.css}`;
  }
}
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-comment
 * This returns a token so the comment can be highlighted and rendered. In the
 * spec, consuming comments would not return anything.
 *
 * Expects the stream to be positioned on the opening slash-asterisk pair. An
 * unterminated comment runs to the end of the input.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} a comment-token holding the raw text,
 *   delimiters included
 */
const consumeCommentToken = stream => {
  // Take the two-character opener up front so that its asterisk can never be
  // mistaken for the start of the closing delimiter (as in a slash-asterisk
  // immediately followed by another slash).
  let comment = stream.consume() + stream.consume();
  let last = '';
  while (stream.hasNext()) {
    const char = stream.consume();
    comment += char;
    if (last === '*' && char === '/') {
      break;
    }
    last = char;
  }
  return {
    t: 'comment-token',
    v: comment,
  };
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-token
 * Consumes one character unconditionally, then keeps consuming for as long as
 * the next character is whitespace.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} a whitespace-token holding the run
 */
const consumeWhitespaceToken = stream => {
  // The caller has already established that the first character is
  // whitespace, so it is taken without a check.
  let run = stream.consume();
  while (stream.hasNext() && isWhitespace(stream.next)) {
    run += stream.consume();
  }
  return { t: 'whitespace-token', v: run };
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-name
 * Consumes name code points and two-character escapes from the front of the
 * stream and returns them verbatim. Stops, leaving the stream untouched, at
 * the first character that is neither.
 * @param {CssFakeStream} stream
 * @returns {string} the consumed name, possibly empty
 */
const consumeName = stream => {
  let name = '';
  for (;;) {
    const upcoming = stream.next;
    if (isNameCodePoint(upcoming)) {
      name += stream.consume();
    } else if (isValidEscape(upcoming, stream.nextPlus1)) {
      // Keep the backslash and the escaped character as written.
      name += stream.consume() + stream.consume();
    } else {
      break;
    }
    if (!stream.hasNext()) {
      break;
    }
  }
  return name;
};
/**
 * https://www.w3.org/TR/css-syntax-3/#ref-for-name-code-point
 * Consumes the leading hash character and the name that follows it.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} a hash-token, or a hash-token-id when the
 *   text after the hash would start an identifier
 */
const consumeHashToken = stream => {
  const hash = stream.consume();
  // In the spec, this would set a type flag to id, but we just need a new
  // general type class for styling. The check has to run before the name is
  // consumed because it inspects the lookahead.
  const looksLikeId = startsIdentifier(
    stream.next,
    stream.nextPlus1,
    stream.nextPlus2
  );
  const name = consumeName(stream);
  return {
    t: looksLikeId ? 'hash-token-id' : 'hash-token',
    v: hash + name,
  };
};
/**
 * Consumes up to `numberOfChars` characters (one by default) and wraps them
 * in a token of the given type. Fewer characters are taken when the stream
 * runs out first.
 * @param {CssFakeStream} stream
 * @param {string} tokenType value for the token's `t` field
 * @param {number} [numberOfChars=1] maximum number of characters to take
 * @returns {{t: string, v: string}}
 */
const consumeGenericToken = (stream, tokenType, numberOfChars = 1) => {
  let text = '';
  for (let taken = 0; taken < numberOfChars && stream.hasNext(); taken += 1) {
    text += stream.consume();
  }
  return { t: tokenType, v: text };
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-string-token
 * Consumes a quoted string, delimiters included. The character the stream is
 * positioned on is taken as the quote that must close the string.
 *
 * Where the spec would raise a parse error (end of input, or an unescaped
 * newline inside the string) this still returns a string-token with the text
 * gathered so far, so that it can be highlighted. An unescaped newline is
 * never part of the token; it stays in the stream for the next token.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}}
 */
const consumeStringToken = stream => {
  const endChar = stream.next;
  let value = stream.consume(); // opening quote
  while (stream.hasNext()) {
    // If following the spec, this would be a parse error. Checked before
    // consuming so that a newline at the very end of the input is treated
    // the same way as one in the middle.
    if (stream.next === '\n') {
      break;
    }
    const currentChar = stream.consume();
    value += currentChar;
    if (currentChar === endChar) {
      break;
    }
    if (currentChar === '\\') {
      // TODO: implement full consume escaped code point algorithm
      // https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
      // For now the escaped character is copied verbatim, which also covers
      // escaped quotes and escaped newlines. At the end of the input
      // consume() yields '' and nothing is added.
      value += stream.consume();
    }
  }
  return {
    t: 'string-token',
    v: value,
  };
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-number
 * Consumes the textual representation of a number: an optional sign, integer
 * digits, an optional fraction and an optional exponent.
 *
 * The specification also tracks a number type and a numeric value. This
 * implementation ignores both and only collects the characters.
 * @param {CssFakeStream} stream
 * @returns {string} the consumed text; empty when the stream does not start
 *   with a number
 */
const consumeNumber = stream => {
  const isDigitChar = char => digitPattern.test(char);
  // While the next input code point is a digit, consume it.
  const consumeDigits = () => {
    let digits = '';
    while (stream.hasNext() && isDigitChar(stream.next)) {
      digits += stream.consume();
    }
    return digits;
  };

  let repr = '';
  // If the next input code point is U+002B PLUS SIGN (+) or U+002D
  // HYPHEN-MINUS (-), consume it and append it to repr.
  if (stream.next === '+' || stream.next === '-') {
    repr += stream.consume();
  }
  repr += consumeDigits();
  // If the next 2 input code points are U+002E FULL STOP (.) followed by a
  // digit, consume them and every digit after them.
  if (stream.next === '.' && isDigitChar(stream.nextPlus1)) {
    repr += stream.consume(); // Consume '.'
    repr += consumeDigits();
  }
  // If the next 2 or 3 input code points are E or e, optionally followed by
  // a sign, followed by a digit, consume them and every digit after them.
  // An "e" that is not followed by such an exponent (as in "1em") is left in
  // the stream so that it can become part of a unit.
  if (stream.next === 'E' || stream.next === 'e') {
    const hasSign = stream.nextPlus1 === '-' || stream.nextPlus1 === '+';
    const firstExponentDigit = hasSign ? stream.nextPlus2 : stream.nextPlus1;
    if (isDigitChar(firstExponentDigit)) {
      repr += stream.consume(); // Consume e character
      if (hasSign) {
        repr += stream.consume(); // Consume sign
      }
      repr += consumeDigits();
    }
  }
  return repr;
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-numeric-token
 * Consumes a number and classifies it by what follows: a unit makes it a
 * dimension-token, a percent sign a percentage-token, anything else a
 * number-token. The token text always includes the unit or percent sign.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}}
 */
const consumeNumericToken = stream => {
  const number = consumeNumber(stream);

  // If the next 3 input code points would start an identifier, the name that
  // follows is the unit of a <dimension-token>.
  if (startsIdentifier(stream.next, stream.nextPlus1, stream.nextPlus2)) {
    const unit = consumeName(stream);
    return { t: 'dimension-token', v: number + unit };
  }

  // Otherwise, a U+0025 PERCENTAGE SIGN (%) is consumed into a
  // <percentage-token>.
  if (stream.next === '%') {
    return { t: 'percentage-token', v: `${number}${stream.consume()}` };
  }

  // Otherwise this is a plain <number-token>.
  return { t: 'number-token', v: number };
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
 * Consumes what follows the backslash of an escape and returns it as
 * written. The part of the spec that interprets the hex number is ignored;
 * this implementation is only used for syntax highlighting.
 * @param {CssFakeStream} stream positioned just after the backslash
 * @returns {string} the raw escape body, or U+FFFD REPLACEMENT CHARACTER when
 *   the stream is already exhausted
 */
const consumeEscapedCodePoint = stream => {
  let escaped = stream.consume();

  if (!isHexDigit(escaped)) {
    // Nothing left to consume is a parse error; any other character simply
    // stands for itself.
    return escaped === '' ? '\uFFFD' : escaped;
  }

  // Consume as many hex digits as possible, but no more than 5. Note that
  // this means 1-6 hex digits have been consumed in total.
  for (
    let extraDigits = 0;
    extraDigits < 5 && stream.hasNext() && isHexDigit(stream.next);
    extraDigits += 1
  ) {
    escaped += stream.consume();
  }

  // If the next input code point is whitespace, consume it as well.
  if (isWhitespace(stream.next)) {
    escaped += stream.consume();
  }
  return escaped;
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-the-remnants-of-a-bad-url
 *
 * Consumes up to and including the closing parenthesis of a malformed URL,
 * or to the end of the input when there is none.
 *
 * Deviates from the standard because this implementation does not throw away
 * invalid characters, rather shows them as part of syntax highlighting.
 *
 * @param {CssFakeStream} stream
 * @returns {string} everything that was consumed
 */
const consumeRemnantsOfABadUrl = stream => {
  let remnants = '';
  while (stream.hasNext()) {
    const char = stream.consume();
    remnants += char;
    if (char === ')') {
      break;
    }
    if (isValidEscape(char, stream.next)) {
      // Consuming the escaped code point here lets an escaped right
      // parenthesis ("\)") pass without ending the <bad-url-token>.
      remnants += consumeEscapedCodePoint(stream);
    }
  }
  return remnants;
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-url-token
 * Consumes the body of an unquoted url, up to and including the closing
 * parenthesis.
 *
 * Unlike the spec, the token text keeps every consumed character (whitespace,
 * escapes and the remnants of a bad url included) so that the source can be
 * rendered back. Parse errors are not thrown; they are recorded by setting
 * `e` to 'parse-error' on the returned token.
 *
 * @param {CssFakeStream} stream positioned just after the opening parenthesis
 * @param {string} [prefix=''] text already consumed by the caller, placed at
 *   the front of the token text
 * @returns {{t: string, v: string, e?: string}} a url-token, or a
 *   bad-url-token when the url is malformed
 */
const consumeUrl = (stream, prefix = '') => {
  // Initially create a <url-token>; the prefix is added so that the text the
  // caller consumed is displayed at the front.
  const urlToken = {
    t: 'url-token',
    v: prefix,
  };
  // Consume as much whitespace as possible.
  const consumeWhitespaceRun = () => {
    while (stream.hasNext() && isWhitespace(stream.next)) {
      urlToken.v += stream.consume();
    }
  };
  // Consume the remnants of a bad url, turn the token into a
  // <bad-url-token>, and hand it back for the caller to return.
  const finishAsBadUrl = () => {
    urlToken.t = 'bad-url-token';
    urlToken.v += consumeRemnantsOfABadUrl(stream);
    return urlToken;
  };

  consumeWhitespaceRun();
  // Repeatedly consume the next input code point from the stream:
  while (stream.hasNext()) {
    const char = stream.consume();
    // Anything consumed becomes part of the token text.
    urlToken.v += char;
    if (char === ')') {
      return urlToken;
    }
    if (isWhitespace(char)) {
      consumeWhitespaceRun();
      // Whitespace may only be followed by the closing parenthesis or by the
      // end of the input. The latter is a parse error but still a
      // <url-token>.
      if (!stream.hasNext()) {
        urlToken.e = 'parse-error';
        return urlToken;
      }
      if (stream.next === ')') {
        urlToken.v += stream.consume();
        return urlToken;
      }
      return finishAsBadUrl();
    }
    if (
      char === '"' ||
      char === "'" ||
      char === '(' ||
      isNonPrintable(char)
    ) {
      // This is a parse error.
      urlToken.e = 'parse-error';
      return finishAsBadUrl();
    }
    if (char === '\\') {
      if (!isValidEscape(char, stream.next)) {
        // This is a parse error. Returning here matters: carrying on with
        // the loop would swallow the CSS that follows the bad url.
        urlToken.e = 'parse-error';
        return finishAsBadUrl();
      }
      // Consume an escaped code point and append it to the token text.
      urlToken.v += consumeEscapedCodePoint(stream);
    }
  }
  // This is a parse error (reached EOF). Return the <url-token>.
  urlToken.e = 'parse-error';
  return urlToken;
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-an-ident-like-token
 * Consumes a name and decides from what follows whether it is an identifier,
 * a function, or the start of an unquoted url.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string, e?: string}} an ident-token, a
 *   function-token, or whatever consumeUrl returns
 */
const consumeIdentLikeToken = stream => {
  const name = consumeName(stream);

  // No opening parenthesis: a plain identifier.
  if (stream.next !== '(') {
    return { t: 'ident-token', v: name };
  }

  // Any name other than "url" (compared case-insensitively) followed by an
  // opening parenthesis is a function.
  if (name.toLowerCase() !== 'url') {
    return { t: 'function-token', v: name + stream.consume() };
  }

  let text = name + stream.consume();
  // While the next two input code points are whitespace, consume the next
  // input code point. This leaves at most one whitespace character in front
  // of whatever comes next.
  while (isWhitespace(stream.next) && isWhitespace(stream.nextPlus1)) {
    text += stream.consume();
  }

  // A quote, directly or after one whitespace character, means the argument
  // is a string: return a <function-token> and leave the rest to the string
  // tokenizer.
  const isQuote = char => char === '"' || char === "'";
  const quotedArgumentFollows =
    isQuote(stream.next) ||
    (isWhitespace(stream.next) && isQuote(stream.nextPlus1));
  if (quotedArgumentFollows) {
    return { t: 'function-token', v: text };
  }

  // Otherwise, consume a url token, and return it.
  return consumeUrl(stream, text);
};
/**
 * https://www.w3.org/TR/css-syntax-3/#ref-for-typedef-at-keyword-token%E2%91%A3
 * Consumes the at sign the stream is positioned on, then the name after it.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} an at-keyword-token whose text includes
 *   the at sign
 */
const consumeAtKeywordToken = stream => {
  const atSign = stream.consume();
  const keyword = consumeName(stream);
  return { t: 'at-keyword-token', v: `${atSign}${keyword}` };
};
// https://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms
const parse = cssStr => {
const tokens = [];
const stream = new CssFakeStream(cssStr);
let cache = '';
const consumePlainToken = () => {
if (cache) {
tokens.push({
t: 'plain-token',
v: cache,
});
cache = '';
}
};
const consumeToken = (consumeFunction, ...args) => {
consumePlainToken();
const token = consumeFunction(stream, ...args);
if (token) {
tokens.push(token);
}
};
while (stream.hasNext()) {
if (stream.next === '/' && stream.nextPlus1 === '*') {
// Consume comments.
consumeToken(consumeCommentToken);
} else if (isWhitespace(stream.next)) {
// whitespace
consumeToken(consumeWhitespaceToken);
} else if (isQuotationMark(stream.next)) {
// Consume a string token and return it.
consumeToken(consumeStringToken);
} else if (stream.next === '#') {
// If the next input code point is a name code point or the next two input
// code points are a valid escape, then:
if (
isNameCodePoint(stream.nextPlus1) ||
isValidEscape(stream.nextPlus1, stream.nextPlus2)
) {
consumeToken(consumeHashToken);
} else {
// Otherwise, return a <delim-token> with its value set to the current
// input code point.
consumeToken(consumeGenericToken, 'delim-token');
}
} else if (stream.next === '(') {
// Return a <(-token>.
consumeToken(consumeGenericToken, 'left-parentheses-token');
} else if (stream.next === ')') {
// Return a <)-token>.
consumeToken(consumeGenericToken, 'right-parentheses-token');
} else if (stream.next === '+') {
// If the input stream starts with a number, reconsume the current input
// code point, consume a numeric token and return it.
if (startsNumber(stream.nextPlus1, stream.nextPlus2, stream.nextPlus3)) {
consumeToken(consumeNumericToken);
} else {
// Otherwise, return a <delim-token> with its value set to the current
// input code point.
consumeToken(consumeGenericToken, 'delim-token');
}
} else if (stream.next === ',') {
consumeToken(consumeGenericToken, 'comma-token');
} else if (stream.next === '-') {
// If the input stream starts with a number, reconsume the current input
// code point, consume a numeric token, and return it.
if (startsNumber(stream.nextPlus1, stream.nextPlus2, stream.nextPlus3)) {
consumeToken(consumeNumericToken);
} else if (stream.nextPlus1 === '-' && stream.nextPlus2 === '>') {
// Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS
// U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
consumeToken(consumeGenericToken, 'cdc-token', 3);
} else if (startsIdentifier(stream.nextPlus1, stream.nextPlus2)) {
// Otherwise, if the input stream starts with an identifier, reconsume
// the current input code point, consume an ident-like token, and return
// it.
consumeToken(consumeIdentLikeToken);
} else {
// Otherwise, return a <delim-token> with its value set to the current
// input code point.
consumeToken(consumeGenericToken, 'delim-token');
}
} else if (stream.next === '.') {
// If the input stream starts with a number, reconsume the current input
// code point, consume a numeric token, and return it.
if (startsNumber(stream.nextPlus1, stream.nextPlus2, stream.nextPlus3)) {
consumeToken(consumeNumericToken);
} else {
// Otherwise, return a <delim-token> with its value set to the current
// input code point.
consumeToken(consumeGenericToken, 'delim-token');
}
} else if (stream.next === ':') {
consumeToken(consumeGenericToken, 'colon-token');
} else if (stream.next === ';') {
consumeToken(consumeGenericToken, 'semi-colon-token');
} else if (stream.next === '<') {
if (
stream.nextPlus1 === '!' &&
stream.nextPlus2 === '-' &&
stream.nextPlus3 === '-'
) {
consumeToken(consumeGenericToken, 'cdo-token', 4);
} else {
consumeToken(consumeGenericToken, 'delim-token');
}
} else if (stream.next === '@') {
if (
startsIdentifier(stream.nextPlus1, stream.nextPlus1, stream.nextPlus3)
) {
return consumeToken(consumeAtKeywordToken);
} else {
consumeToken(consumeGenericToken, 'delim-token');
}
} else if (stream.next === '[') {
consumeToken(consumeGenericToken, 'left-bracket-token');
} else if (stream.next === '\\') {
if (isValidEscape(stream.next, stream.nextPlus1)) {
consumeToken(consumeIdentLikeToken);
} else {
consumeToken(consumeGenericToken, 'delim-token');
}
} else if (stream.next === ']') {
consumeToken(consumeGenericToken, 'right-bracket-token');
} else if (stream.next === '{') {
consumeToken(consumeGenericToken, 'left-curly-token');
} else if (stream.next === '}') {
consumeToken(consumeGenericToken, 'right-curly-token');
} else if (isDigit(stream.next)) {
consumeToken(consumeNumericToken);
} else if (isNameStartCodePoint(stream.next)) {
consumeToken(consumeIdentLikeToken);
} else {
cache += stream.consume();
}
}
consumePlainToken();
return tokens;
};
// Demo: tokenize a small stylesheet and print the tokens one per line.
const cssInput = `/* c */
#w{width:calc(+100%);content:'kfdl';background-image:url(http://example.com/a.png)}`;
const cssOutput = parse(cssInput);
// The text of every token, concatenated in order. map/join also copes with an
// empty or single-element token list, which reduce without an initial value
// does not.
const cssStrOutput = cssOutput.map(token => token.v).join('');
console.log(
  // Reuse the tokens from above rather than tokenizing the input again. The
  // replacements strip the braces and line breaks of each pretty-printed
  // token object so that every token ends up on a single line.
  JSON.stringify(cssOutput, null, 2)
    .replace(/\{\n\s*/g, ' ')
    .replace(/\n\s*\}/g, ' ')
    .replace(/",\n\s*/g, '", ')
    .replace(/ ,\n/g, ',\n')
);