A naïve attempt at parsing CSS in JavaScript – Part 9: parsing numbers
This iteration of developing the parser was quite fun because the specification turns into code nearly word-for-word. I included comments in the consumeNumber function which use the wording of the CSS tokenizer specification alongside the code's interpretation of the specification.
It is not a perfect implementation of the specification, as this naive tokeniser only has the goal to highlight CSS text in a webpage, not actually use the CSS for rendering purposes.
Here is the consumeNumber function snippet:
/**
 * Consumes the textual representation of a number from the front of the
 * stream.
 * https://www.w3.org/TR/css-syntax-3/#consume-a-number
 *
 * The specification also tracks a number type and converts the representation
 * to a numeric value. This implementation ignores both and only collects the
 * characters that make up the number.
 *
 * @param {CssFakeStream} stream
 * @returns {string} The characters that were consumed (possibly empty).
 */
const consumeNumber = stream => {
  // While the next input code point is a digit, consume it and append it to
  // the result.
  const consumeDigits = () => {
    let digits = '';
    while (stream.hasNext() && digitPattern.test(stream.next)) {
      digits += stream.consume();
    }
    return digits;
  };

  // Let repr be the empty string.
  let repr = '';

  // If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS
  // (-), consume it and append it to repr.
  if (stream.next === '+' || stream.next === '-') {
    repr += stream.consume();
  }

  // While the next input code point is a digit, consume it and append it to
  // repr.
  repr += consumeDigits();

  // If the next 2 input code points are U+002E FULL STOP (.) followed by a
  // digit, then consume them and every digit that follows.
  if (stream.next === '.' && digitPattern.test(stream.nextPlus1)) {
    repr += stream.consume(); // Consume '.'
    repr += consumeDigits(); // Consume the first digit and the rest
  }

  // If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E)
  // or U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D
  // HYPHEN-MINUS (-) or U+002B PLUS SIGN (+), followed by a digit, then
  // consume them and every digit that follows.
  if (stream.next === 'E' || stream.next === 'e') {
    const hasSign = stream.nextPlus1 === '-' || stream.nextPlus1 === '+';
    const firstExponentDigit = hasSign ? stream.nextPlus2 : stream.nextPlus1;
    // Only commit to the exponent once a digit is confirmed; otherwise the
    // "e" belongs to whatever follows the number (for example the unit "em").
    if (digitPattern.test(firstExponentDigit)) {
      repr += stream.consume(); // Consume e character
      if (hasSign) {
        repr += stream.consume(); // Consume sign
      }
      repr += consumeDigits(); // Consume the first digit and the rest
    }
  }

  return repr;
};
When running the full code on CSS that looks like this:
/* c */
#w{width:calc(+100%);content:'kfdl';}
The result is:
[
{ "t": "comment-token", "v": "/* c */" },
{ "t": "whitespace-token", "v": "\n" },
{ "t": "hash-token-id", "v": "#w" },
{ "t": "plain-token", "v": "{width:calc" },
{ "t": "left-parentheses-token", "v": "(" },
{ "t": "percentage-token", "v": "+100%" },
{ "t": "right-parentheses-token","v": ")" },
{ "t": "plain-token", "v": ";content:" },
{ "t": "string-token", "v": "'kfdl'" },
{ "t": "plain-token", "v": ";}" }
]
There are still a few items that are currently emitted as plain-token items and need to be consumed as proper tokens per the specification, but the output is starting to contain a good number of specification-based tokens.
Here is the full code:
// Tests for an ASCII letter. Unanchored, so callers pass single characters.
const letterPattern = /[a-zA-Z]/;
// Tests for an ASCII digit. Unanchored, so callers pass single characters.
const digitPattern = /[0-9]/;
/**
 * https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point
 * A non-ASCII code point has a value equal to or greater than U+0080, so the
 * comparison must be inclusive. Returns false for the empty string because
 * its char code is NaN.
 * @param {string} char
 * @returns {boolean}
 */
const isNonAscii = char => char.charCodeAt(0) >= 0x80;
/**
 * True for a space, a line feed or a tab.
 * @param {string} char
 * @returns {boolean}
 */
const isWhitespace = char => char === ' ' || char === '\n' || char === '\t';
/**
 * True for a double or single quotation mark.
 * @param {string} char
 * @returns {boolean}
 */
const isQuotationMark = char => char === '"' || char === "'";
/**
 * https://www.w3.org/TR/css-syntax-3/#name-start-code-point
 * True for an underscore, a non-ASCII character or an ASCII letter.
 * @param {string} char
 * @returns {boolean}
 */
const isNameStartCodePoint = char => {
  if (char === '_') {
    return true;
  }
  if (isNonAscii(char)) {
    return true;
  }
  return letterPattern.test(char);
};
/**
 * https://www.w3.org/TR/css-syntax-3/#name-code-point
 * True for a hyphen, a digit, or anything accepted by isNameStartCodePoint.
 * @param {string} char
 * @returns {boolean}
 */
const isNameCodePoint = char => {
  if (char === '-' || digitPattern.test(char)) {
    return true;
  }
  return isNameStartCodePoint(char);
};
/**
 * https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
 * Two code points form a valid escape when the first is a backslash and the
 * second is anything other than a line feed.
 * @param {string} next
 * @param {string} nPlus1
 * @returns {boolean}
 */
const isValidEscape = (next, nPlus1) => next === '\\' && nPlus1 !== '\n';
/**
 * https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
 * Checks whether three code points would start an identifier.
 * @param {string} n First code point.
 * @param {string} nPlus1 Second code point.
 * @param {string} nPlus2 Third code point.
 * @returns {boolean}
 */
const startsIdentifier = (n, nPlus1, nPlus2) => {
  if (n !== '-') {
    // Either a name-start code point or the beginning of a valid escape
    // (isValidEscape itself requires the first code point to be a backslash).
    return isNameStartCodePoint(n) || isValidEscape(n, nPlus1);
  }
  // After a leading hyphen: a name-start code point, a second hyphen, or a
  // valid escape made of the second and third code points.
  return (
    isNameStartCodePoint(nPlus1) ||
    nPlus1 === '-' ||
    isValidEscape(nPlus1, nPlus2)
  );
};
/**
 * https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
 * Checks whether three code points would start a number.
 * @param {string} n First code point.
 * @param {string} nPlus1 Second code point.
 * @param {string} nPlus2 Third code point.
 * @returns {boolean}
 */
const startsNumber = (n, nPlus1, nPlus2) => {
  const isDigit = codePoint => digitPattern.test(codePoint);
  switch (n) {
    case '-':
    case '+':
      // A sign must be followed by a digit, or by a full stop and a digit.
      return isDigit(nPlus1) || (nPlus1 === '.' && isDigit(nPlus2));
    case '.':
      return isDigit(nPlus1);
    default:
      return isDigit(n);
  }
};
/**
 * A minimal look-ahead "stream" over a CSS string. The not-yet-consumed text
 * is kept in `css`; consuming removes characters from its start.
 */
class CssFakeStream {
  /**
   * The text that has not been consumed yet.
   * @type {string}
   */
  css = '';

  /**
   * @param {string} css The text to stream over.
   */
  constructor(css = '') {
    this.css = css;
  }

  /**
   * Looks at the character `offset` positions ahead without consuming it.
   * @param {number} offset
   * @returns {string} The character, or '' when past the end of the text.
   */
  #peek(offset) {
    return this.css.charAt(offset);
  }

  /**
   * The next character to be consumed ('' when the stream is empty).
   * @returns {string}
   */
  get next() {
    return this.#peek(0);
  }

  /**
   * The character after `next` ('' when there is none).
   * @returns {string}
   */
  get nextPlus1() {
    return this.#peek(1);
  }

  /**
   * The character two positions after `next` ('' when there is none).
   * @returns {string}
   */
  get nextPlus2() {
    return this.#peek(2);
  }

  /**
   * The character three positions after `next` ('' when there is none).
   * @returns {string}
   */
  get nextPlus3() {
    return this.#peek(3);
  }

  /**
   * @returns {boolean} True while at least one character is left.
   */
  hasNext() {
    return this.css.length > 0;
  }

  /**
   * Removes a single character from the start of the CSS stream and returns
   * it.
   * @returns {string} The removed character, or '' when the stream is empty.
   */
  consume() {
    const head = this.#peek(0);
    this.css = this.css.slice(1);
    return head;
  }

  /**
   * Reconsumes a character (prepends it to the stream).
   * @param {string} char
   */
  reconsume(char) {
    this.css = `${char}${this.css}`;
  }
}
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-comment
 * This returns a token so the comment can be highlighted and rendered. In the
 * spec, consuming comments would not return anything.
 *
 * The caller is expected to have checked that the stream starts with the
 * comment opener. An unterminated comment runs to the end of the stream.
 *
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} The comment token, delimiters included.
 */
const consumeComment = stream => {
  // Consume the two opening characters up front so they cannot take part in
  // the end-of-comment check; otherwise "/*/" would be treated as a complete
  // comment.
  let comment = stream.consume() + stream.consume();
  let last = '';
  let reachedCommentEnd = false;
  while (stream.hasNext() && !reachedCommentEnd) {
    const char = stream.consume();
    comment += char;
    reachedCommentEnd = last === '*' && char === '/';
    last = char;
  }
  return {
    t: 'comment-token',
    v: comment,
  };
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-token
 * Consumes one character unconditionally (the caller has already identified
 * it as whitespace) and then every whitespace character that follows.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} The whitespace token.
 */
const consumeWhitespace = stream => {
  let run = stream.consume();
  while (stream.hasNext() && isWhitespace(stream.next)) {
    run += stream.consume();
  }
  return { t: 'whitespace-token', v: run };
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-name
 * Collects name code points and two-character escapes from the front of the
 * stream until something else is found. The first character that does not
 * belong to the name is put back on the stream.
 * @param {CssFakeStream} stream
 * @returns {string} The consumed name (possibly empty).
 */
const consumeName = stream => {
  let name = '';
  for (;;) {
    const current = stream.consume();
    if (isNameCodePoint(current)) {
      name += current;
    } else if (isValidEscape(current, stream.next)) {
      // Keep the backslash together with the character it escapes.
      name += current + stream.consume();
    } else {
      // Not part of the name: hand the character back, unless the stream was
      // already empty and there is nothing to hand back.
      if (current) {
        stream.reconsume(current);
      }
      break;
    }
    if (!stream.hasNext()) {
      break;
    }
  }
  return name;
};
/**
 * https://www.w3.org/TR/css-syntax-3/#ref-for-name-code-point
 * Consumes the leading character (the caller has identified it as '#') and
 * the name that follows it.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} The hash token.
 */
const consumeHash = stream => {
  const baseType = 'hash-token';
  const marker = stream.consume();
  // In the spec, this would set a type flag to id, but we just need a new
  // general type class for styling. The check has to run before the name is
  // consumed because it looks at the name's first code points.
  const looksLikeId = startsIdentifier(
    stream.next,
    stream.nextPlus1,
    stream.nextPlus2
  );
  return {
    t: looksLikeId ? `${baseType}-id` : baseType,
    v: marker + consumeName(stream),
  };
};
/**
 * Consumes exactly one character and labels it with the given token type.
 * @param {CssFakeStream} stream
 * @param {string} tokenType
 * @returns {{t: string, v: string}} A token whose value is the consumed character.
 */
const consumeSingleChar = (stream, tokenType) => {
  const value = stream.consume();
  return { t: tokenType, v: value };
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-string-token
 * Consumes a quoted string. The first character on the stream is taken as
 * the quotation mark and the string runs until the same character is found
 * again.
 *
 * Two situations that are parse errors in the spec still produce a
 * string-token here, so the text can be highlighted:
 * - the input ends before the closing quotation mark;
 * - an unescaped newline is found. The newline is left on the stream and the
 *   token ends before it.
 *
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} The string token, quotation marks included.
 */
const consumeString = stream => {
  const endChar = stream.consume();
  let cache = endChar;
  while (stream.hasNext()) {
    const currentChar = stream.consume();
    // Check for the newline before anything else so that it is never
    // swallowed into the token, even when it is the last character of the
    // input.
    if (currentChar === '\n') {
      stream.reconsume(currentChar);
      break;
    }
    cache += currentChar;
    if (currentChar === endChar) {
      break;
    }
    if (currentChar === '\\') {
      // TODO: implement full consume escaped code point algorithm
      // https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
      // For now the escaped character is taken verbatim; at the end of the
      // input consume() yields '' and only the backslash is kept.
      cache += stream.consume();
    }
  }
  return {
    t: 'string-token',
    v: cache,
  };
};
/**
 * Consumes the textual representation of a number from the front of the
 * stream.
 * https://www.w3.org/TR/css-syntax-3/#consume-a-number
 *
 * The specification also tracks a number type and converts the representation
 * to a numeric value. This implementation ignores both and only collects the
 * characters that make up the number.
 *
 * @param {CssFakeStream} stream
 * @returns {string} The characters that were consumed (possibly empty).
 */
const consumeNumber = stream => {
  // While the next input code point is a digit, consume it and append it to
  // the result.
  const consumeDigits = () => {
    let digits = '';
    while (stream.hasNext() && digitPattern.test(stream.next)) {
      digits += stream.consume();
    }
    return digits;
  };

  // Let repr be the empty string.
  let repr = '';

  // If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS
  // (-), consume it and append it to repr.
  if (stream.next === '+' || stream.next === '-') {
    repr += stream.consume();
  }

  // While the next input code point is a digit, consume it and append it to
  // repr.
  repr += consumeDigits();

  // If the next 2 input code points are U+002E FULL STOP (.) followed by a
  // digit, then consume them and every digit that follows.
  if (stream.next === '.' && digitPattern.test(stream.nextPlus1)) {
    repr += stream.consume(); // Consume '.'
    repr += consumeDigits(); // Consume the first digit and the rest
  }

  // If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E)
  // or U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D
  // HYPHEN-MINUS (-) or U+002B PLUS SIGN (+), followed by a digit, then
  // consume them and every digit that follows.
  if (stream.next === 'E' || stream.next === 'e') {
    const hasSign = stream.nextPlus1 === '-' || stream.nextPlus1 === '+';
    const firstExponentDigit = hasSign ? stream.nextPlus2 : stream.nextPlus1;
    // Only commit to the exponent once a digit is confirmed; otherwise the
    // "e" belongs to whatever follows the number (for example the unit "em").
    if (digitPattern.test(firstExponentDigit)) {
      repr += stream.consume(); // Consume e character
      if (hasSign) {
        repr += stream.consume(); // Consume sign
      }
      repr += consumeDigits(); // Consume the first digit and the rest
    }
  }

  return repr;
};
/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-numeric-token
 * Consumes a number and whatever directly follows it, then classifies the
 * result as a dimension, a percentage or a plain number. The token value is
 * always the full consumed text (number plus unit or percentage sign).
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} The numeric token.
 */
const consumeNumeric = stream => {
  // Consume a number and let repr be the result.
  const repr = consumeNumber(stream);

  // If the next 3 input code points would start an identifier, the number is
  // followed by a unit: consume a name and return a <dimension-token>.
  if (startsIdentifier(stream.next, stream.nextPlus1, stream.nextPlus2)) {
    return { t: 'dimension-token', v: repr + consumeName(stream) };
  }

  // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%),
  // consume it and return a <percentage-token>.
  if (stream.next === '%') {
    return { t: 'percentage-token', v: repr + stream.consume() };
  }

  // Otherwise, return a <number-token>.
  return { t: 'number-token', v: repr };
};
// https://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms
/**
 * Splits a CSS string into a flat list of tokens for highlighting. Every
 * character of the input ends up in some token: text that is not recognised
 * as a specific token type is collected into plain-token items.
 * @param {string} cssStr
 * @returns {{t: string, v: string}[]} Tokens in source order; `t` is the token
 * type and `v` the source text it covers.
 */
const parse = cssStr => {
  const tokens = [];
  const stream = new CssFakeStream(cssStr);
  // Characters that have been read but do not belong to a recognised token.
  let cache = '';

  // Emits the pending plain text (if any) as a single plain-token.
  const consumePlainToken = () => {
    if (cache) {
      tokens.push({
        t: 'plain-token',
        v: cache,
      });
      cache = '';
    }
  };

  // Flushes pending plain text so token order matches source order, then
  // runs the given consume function and records the token it returns.
  const consumeToken = (consumeFunction, ...args) => {
    consumePlainToken();
    const token = consumeFunction(stream, ...args);
    if (token) {
      tokens.push(token);
    }
  };

  while (stream.next !== '') {
    if (stream.next === '/' && stream.nextPlus1 === '*') {
      consumeToken(consumeComment);
    } else if (isWhitespace(stream.next)) {
      consumeToken(consumeWhitespace);
    } else if (isQuotationMark(stream.next)) {
      consumeToken(consumeString);
    } else if (stream.next === '#') {
      if (
        isNameCodePoint(stream.nextPlus1) ||
        isValidEscape(stream.nextPlus1, stream.nextPlus2)
      ) {
        consumeToken(consumeHash);
      } else {
        consumeToken(consumeSingleChar, 'delim-token');
      }
    } else if (stream.next === '(') {
      consumeToken(consumeSingleChar, 'left-parentheses-token');
    } else if (stream.next === ')') {
      consumeToken(consumeSingleChar, 'right-parentheses-token');
    } else if (stream.next === '+') {
      // The plus sign has not been consumed yet, so it is the first of the
      // three code points to check. Checking from the code point after it
      // would accept input such as "++1" or "+-1" and emit a number token
      // that contains only the sign.
      if (startsNumber(stream.next, stream.nextPlus1, stream.nextPlus2)) {
        consumeToken(consumeNumeric);
      } else {
        consumeToken(consumeSingleChar, 'delim-token');
      }
    } else {
      cache += stream.consume();
    }
  }
  consumePlainToken();
  return tokens;
};
// Demo: tokenize a small stylesheet and print the tokens as indented JSON.
const sampleCss = `/* c */
#w{width:calc(+100%);content:'kfdl';}`;
const sampleTokens = parse(sampleCss);
console.log(JSON.stringify(sampleTokens, null, 2));