A naïve attempt at parsing CSS in JavaScript – Part 9: parsing numbers

2020-03-03

This iteration of developing the parser was quite fun because the specification turns into code nearly word-for-word. I included comments in the consumeNumber function which quote the wording of the CSS tokenizer specification alongside the code that implements it.

It is not a perfect implementation of the specification, as this naive tokeniser only has the goal to highlight CSS text in a webpage, not actually use the CSS for rendering purposes.

Here is the consumeNumber function snippet:

/**
 * Consumes the characters of a number from the start of the stream.
 * https://www.w3.org/TR/css-syntax-3/#consume-a-number
 * @param {CssFakeStream} stream
 * @returns {string} The consumed characters in source order; possibly empty.
 */
const consumeNumber = stream => {

	// The specification also tracks a number type ("integer" or "number"). This
	// implementation ignores that and only collects the characters it consumes.
	// Initially set type to "integer". Let repr be the empty string.
	let repr = '';

	// While the next input code point is a digit, consume it and append it to
	// repr. The specification repeats this step after every part of the number.
	const consumeDigits = () => {
		while (stream.hasNext() && digitPattern.test(stream.next)) {
			repr += stream.consume();
		}
	};

	// If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS
	// (-), consume it and append it to repr.
	if (stream.next === '+' || stream.next === '-') {
		repr += stream.consume();
	}

	consumeDigits();

	// If the next 2 input code points are U+002E FULL STOP (.) followed by a
	// digit, then consume them, append them to repr and consume the digits that
	// follow.
	if (stream.next === '.' && digitPattern.test(stream.nextPlus1)) {
		repr += stream.consume(); // Consume '.'
		repr += stream.consume(); // Consume first digit
		consumeDigits();
	}

	// If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E)
	// or U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D
	// HYPHEN-MINUS (-) or U+002B PLUS SIGN (+), followed by a digit, then
	// consume them, append them to repr and consume the digits that follow.
	if (stream.next === 'E' || stream.next === 'e') {
		const hasSign = stream.nextPlus1 === '-' || stream.nextPlus1 === '+';
		const firstExponentDigit = hasSign ? stream.nextPlus2 : stream.nextPlus1;

		// Without a digit the "e" is not an exponent (it may start a unit such
		// as "em"), so it is left in the stream untouched.
		if (digitPattern.test(firstExponentDigit)) {
			repr += stream.consume(); // Consume e character
			if (hasSign) {
				repr += stream.consume(); // Consume sign
			}
			repr += stream.consume(); // Consume first digit
			consumeDigits();
		}
	}
	return repr;
};

When running the full code on CSS that looks like this:

/* c */
#w{width:calc(+100%);content:'kfdl';}

The result is:

[
  { "t": "comment-token",          "v": "/* c */" },
  { "t": "whitespace-token",       "v": "\n" },
  { "t": "hash-token-id",          "v": "#w" },
  { "t": "plain-token",            "v": "{width:calc" },
  { "t": "left-parentheses-token", "v": "(" },
  { "t": "percentage-token",       "v": "+100%" },
  { "t": "right-parentheses-token","v": ")" },
  { "t": "plain-token",            "v": ";content:" },
  { "t": "string-token",           "v": "'kfdl'" },
  { "t": "plain-token",            "v": ";}" }
]

A few items are still emitted as plain-token entries and have yet to be consumed as proper tokens according to the specification, but the output is starting to contain a good number of the token types the specification defines.

Here is the full code:

// Matches a single ASCII letter (a-z or A-Z).
const letterPattern = /[a-zA-Z]/;
// Matches a single ASCII digit (0-9).
const digitPattern = /[0-9]/;
// https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point
// A non-ASCII code point has a value equal to or greater than U+0080, so 128
// itself has to be included. An empty string yields NaN and therefore false.
const isNonAscii = char => char.charCodeAt(0) >= 128;
// https://www.w3.org/TR/css-syntax-3/#whitespace
// The specification turns carriage returns and form feeds into line feeds in a
// preprocessing step. This tokenizer has no such step, so both characters are
// accepted as whitespace here.
const isWhitespace = char =>
	char === ' ' ||
	char === '\n' ||
	char === '\t' ||
	char === '\r' ||
	char === '\f';
// True for the two characters that can open or close a CSS string.
const isQuotationMark = char => char === '"' || char === "'";

// https://www.w3.org/TR/css-syntax-3/#name-start-code-point
// A name-start code point is an underscore, a non-ASCII code point or a letter.
const isNameStartCodePoint = char => {
	if (char === '_') {
		return true;
	}
	if (isNonAscii(char)) {
		return true;
	}
	return letterPattern.test(char);
};

// https://www.w3.org/TR/css-syntax-3/#name-code-point
// A name code point is a hyphen, a digit or any name-start code point.
const isNameCodePoint = char => {
	if (char === '-' || digitPattern.test(char)) {
		return true;
	}
	return isNameStartCodePoint(char);
};

// https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
// Two code points form a valid escape when the first is a backslash and the
// second is anything other than a line feed.
const isValidEscape = (next, nPlus1) => next === '\\' && nPlus1 !== '\n';

// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
// Decides from three code points whether they would start an identifier.
const startsIdentifier = (n, nPlus1, nPlus2) => {
	// Anything but a leading hyphen is decided by the first code point alone,
	// or by the first two when it is a backslash.
	if (n !== '-') {
		return isNameStartCodePoint(n) || (n === '\\' && isValidEscape(n, nPlus1));
	}

	// After a hyphen, the second code point must start a name, be another
	// hyphen, or begin a valid escape together with the third code point.
	return (
		isNameStartCodePoint(nPlus1) ||
		nPlus1 === '-' ||
		isValidEscape(nPlus1, nPlus2)
	);
};

// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
// Decides from three code points whether they would start a number.
const startsNumber = (n, nPlus1, nPlus2) => {
	switch (n) {
		case '+':
		case '-':
			// A sign needs a digit right after it, or a full stop and then a digit.
			return (
				digitPattern.test(nPlus1) ||
				(nPlus1 === '.' && digitPattern.test(nPlus2))
			);
		case '.':
			// A leading full stop needs a digit right after it.
			return digitPattern.test(nPlus1);
		default:
			return digitPattern.test(n);
	}
};

/**
 * Minimal character stream over a CSS string. The unread text is kept in
 * `css`: consuming shortens it from the front, reconsuming prepends to it.
 */
class CssFakeStream {
	/**
	 * The text that has not been consumed yet.
	 * @type {string}
	 */
	css = '';

	/**
	 * @param {string} [css] Text to read from; defaults to an empty string.
	 */
	constructor(css = '') {
		this.css = css;
	}

	/**
	 * The first unread character, or an empty string when nothing is left.
	 * @returns {string}
	 */
	get next() {
		return this.css.slice(0, 1);
	}

	/**
	 * The second unread character, or an empty string when there is none.
	 * @returns {string}
	 */
	get nextPlus1() {
		return this.css.slice(1, 2);
	}

	/**
	 * The third unread character, or an empty string when there is none.
	 * @returns {string}
	 */
	get nextPlus2() {
		return this.css.slice(2, 3);
	}

	/**
	 * The fourth unread character, or an empty string when there is none.
	 * @returns {string}
	 */
	get nextPlus3() {
		return this.css.slice(3, 4);
	}

	/**
	 * Tells whether at least one unread character is left.
	 * @returns {boolean}
	 */
	hasNext() {
		return this.css.length > 0;
	}

	/**
	 * Removes a single character from the start of the CSS stream and returns
	 * it. Returns an empty string when the stream is already empty.
	 * @returns {string}
	 */
	consume() {
		const head = this.css.slice(0, 1);
		this.css = this.css.slice(1);
		return head;
	}

	/**
	 * Reconsumes a character (prepends to stream).
	 * @param {string} char
	 */
	reconsume(char) {
		this.css = `${char}${this.css}`;
	}
}

/**
 * https://www.w3.org/TR/css-syntax-3/#consume-comment
 * This returns a token so the comment can be highlighted and rendered. In the
 * spec, consuming comments would not return anything.
 *
 * Expects the stream to start with the comment opener. Consumes up to and
 * including the first comment terminator, or to the end of the stream when the
 * comment is never closed.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} A comment-token holding the consumed text.
 */
const consumeComment = stream => {
	// Consume the two-character opener up front. Otherwise its asterisk could be
	// paired with a slash that directly follows it and end the comment after
	// only three characters.
	let comment = stream.consume() + stream.consume();
	let last = '';
	while (stream.hasNext()) {
		const char = stream.consume();
		comment += char;
		if (last === '*' && char === '/') {
			break;
		}
		last = char;
	}
	return {
		t: 'comment-token',
		v: comment,
	};
};

/**
 * https://www.w3.org/TR/css-syntax-3/#consume-token
 * Collects a run of whitespace into one token. The first character is taken
 * without being checked; every following whitespace character is appended.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} A whitespace-token holding the consumed run.
 */
const consumeWhitespace = stream => {
	let run = stream.consume();
	while (stream.hasNext() && isWhitespace(stream.next)) {
		run += stream.consume();
	}
	return { t: 'whitespace-token', v: run };
};

/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-name
 * Collects name code points and two-character escapes from the start of the
 * stream. The first character that fits neither is put back on the stream.
 * @param {CssFakeStream} stream
 * @returns {string} The consumed name; possibly empty.
 */
const consumeName = stream => {
	let name = '';
	for (;;) {
		const current = stream.consume();
		if (isNameCodePoint(current)) {
			name += current;
		} else if (isValidEscape(current, stream.next)) {
			// Keep the backslash together with the character it escapes.
			name += current + stream.consume();
		} else {
			// Not part of the name: hand it back, unless the stream was already
			// empty and there is nothing to hand back.
			if (current) {
				stream.reconsume(current);
			}
			break;
		}
		if (!stream.hasNext()) {
			break;
		}
	}
	return name;
};

/**
 * https://www.w3.org/TR/css-syntax-3/#ref-for-name-code-point
 * Consumes the first character of the stream together with the name that
 * follows it and returns both as one hash token.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} A hash-token, or a hash-token-id when the
 * text after the first character would start an identifier.
 */
const consumeHash = stream => {
	const baseType = 'hash-token';
	const marker = stream.consume();

	// In the spec, this would set a type flag to id, but we just need a new
	// general type class for styling.
	const isId = startsIdentifier(stream.next, stream.nextPlus1, stream.nextPlus2);
	const type = isId ? baseType + '-id' : baseType;

	return {
		t: type,
		v: marker + consumeName(stream),
	};
};

/**
 * Takes exactly one character from the stream and wraps it in a token of the
 * requested type.
 * @param {CssFakeStream} stream
 * @param {string} tokenType
 * @returns {{t: string, v: string}} The token holding the consumed character.
 */
const consumeSingleChar = (stream, tokenType) => {
	const v = stream.consume();
	return { t: tokenType, v };
};

/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-string-token
 *
 * Consumes a quoted string. The first character of the stream is taken as the
 * opening quotation mark and the string runs up to and including the next
 * unescaped occurrence of that same character. An unescaped line feed or the
 * end of the stream ends the string early; the line feed stays in the stream.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}} A string-token holding the consumed text.
 */
const consumeString = stream => {
	const endChar = stream.next;
	let cache = stream.consume();

	// Running out of input ends the string. If following the spec, this would
	// be a parse error.
	while (stream.hasNext()) {
		const currentChar = stream.consume();

		// If following the spec, this would be a parse error. The line feed is
		// handed back so it is tokenized on its own, even when it is the very
		// last character of the input.
		if (currentChar === '\n') {
			stream.reconsume(currentChar);
			break;
		}

		if (currentChar === '\\') {
			// TODO: implement full consume escaped code point algorithm
			// https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
			// consume() yields an empty string at the end of the stream, so a
			// trailing backslash is kept as it is.
			cache += currentChar + stream.consume();
			continue;
		}

		cache += currentChar;
		if (currentChar === endChar) {
			break;
		}
	}

	return {
		t: 'string-token',
		v: cache,
	};
};

/**
 * Consumes the characters of a number from the start of the stream.
 * https://www.w3.org/TR/css-syntax-3/#consume-a-number
 * @param {CssFakeStream} stream
 * @returns {string} The consumed characters in source order; possibly empty.
 */
const consumeNumber = stream => {

	// The specification also tracks a number type ("integer" or "number"). This
	// implementation ignores that and only collects the characters it consumes.
	// Initially set type to "integer". Let repr be the empty string.
	let repr = '';

	// While the next input code point is a digit, consume it and append it to
	// repr. The specification repeats this step after every part of the number.
	const consumeDigits = () => {
		while (stream.hasNext() && digitPattern.test(stream.next)) {
			repr += stream.consume();
		}
	};

	// If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS
	// (-), consume it and append it to repr.
	if (stream.next === '+' || stream.next === '-') {
		repr += stream.consume();
	}

	consumeDigits();

	// If the next 2 input code points are U+002E FULL STOP (.) followed by a
	// digit, then consume them, append them to repr and consume the digits that
	// follow.
	if (stream.next === '.' && digitPattern.test(stream.nextPlus1)) {
		repr += stream.consume(); // Consume '.'
		repr += stream.consume(); // Consume first digit
		consumeDigits();
	}

	// If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E)
	// or U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D
	// HYPHEN-MINUS (-) or U+002B PLUS SIGN (+), followed by a digit, then
	// consume them, append them to repr and consume the digits that follow.
	if (stream.next === 'E' || stream.next === 'e') {
		const hasSign = stream.nextPlus1 === '-' || stream.nextPlus1 === '+';
		const firstExponentDigit = hasSign ? stream.nextPlus2 : stream.nextPlus1;

		// Without a digit the "e" is not an exponent (it may start a unit such
		// as "em"), so it is left in the stream untouched.
		if (digitPattern.test(firstExponentDigit)) {
			repr += stream.consume(); // Consume e character
			if (hasSign) {
				repr += stream.consume(); // Consume sign
			}
			repr += stream.consume(); // Consume first digit
			consumeDigits();
		}
	}
	return repr;
};

/**
 * https://www.w3.org/TR/css-syntax-3/#consume-a-numeric-token
 * Consumes a number and whatever directly qualifies it, returning a dimension,
 * percentage or plain number token whose value is the consumed text.
 * @param {CssFakeStream} stream
 * @returns {{t: string, v: string}}
 */
const consumeNumeric = stream => {

	// Consume a number and let number be the result.
	const number = consumeNumber(stream);

	// If the next 3 input code points would start an identifier, then create a
	// <dimension-token>. The spec keeps the unit in a separate field; here the
	// consumed name is appended to the token value instead.
	if (startsIdentifier(stream.next, stream.nextPlus1, stream.nextPlus2)) {
		return {
			t: 'dimension-token',
			v: number + consumeName(stream),
		};
	}

	// Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%),
	// consume it and return a <percentage-token>.
	if (stream.next === '%') {
		const sign = stream.consume();
		return {
			t: 'percentage-token',
			v: `${number}${sign}`,
		};
	}

	// Otherwise, return a <number-token> with the same value as number.
	return {
		t: 'number-token',
		v: number,
	};
};

/**
 * https://www.w3.org/TR/css-syntax-3/#tokenizer-algorithms
 *
 * Splits a CSS string into tokens. Characters that no branch below recognises
 * are collected and emitted together as plain-token entries, so joining the
 * values of all tokens gives back the input text.
 * @param {string} cssStr
 * @returns {{t: string, v: string}[]} The tokens in source order.
 */
const parse = cssStr => {
	const tokens = [];
	const stream = new CssFakeStream(cssStr);

	// Unrecognised characters seen since the last emitted token.
	let cache = '';

	// Emits the collected unrecognised characters, if any, as one plain-token.
	const consumePlainToken = () => {
		if (cache) {
			tokens.push({
				t: 'plain-token',
				v: cache,
			});
			cache = '';
		}
	};

	// Flushes pending plain text first so tokens stay in source order, then
	// runs the given consume function and keeps the token it returns.
	const consumeToken = (consumeFunction, ...args) => {
		consumePlainToken();
		const token = consumeFunction(stream, ...args);
		if (token) {
			tokens.push(token);
		}
	};

	while (stream.next !== '') {
		if (stream.next === '/' && stream.nextPlus1 === '*') {
			consumeToken(consumeComment);
		} else if (isWhitespace(stream.next)) {
			consumeToken(consumeWhitespace);
		} else if (isQuotationMark(stream.next)) {
			consumeToken(consumeString);
		} else if (stream.next === '#') {
			if (
				isNameCodePoint(stream.nextPlus1) ||
				isValidEscape(stream.nextPlus1, stream.nextPlus2)
			) {
				consumeToken(consumeHash);
			} else {
				consumeToken(consumeSingleChar, 'delim-token');
			}
		} else if (stream.next === '(') {
			consumeToken(consumeSingleChar, 'left-parentheses-token');
		} else if (stream.next === ')') {
			consumeToken(consumeSingleChar, 'right-parentheses-token');
		} else if (stream.next === '+') {
			// "Starts with a number" has to be checked from the plus sign itself.
			// Checking from the character after it would accept input such as
			// "+-5" or "++1" and produce a number token made of a lone "+".
			if (startsNumber(stream.next, stream.nextPlus1, stream.nextPlus2)) {
				consumeToken(consumeNumeric);
			} else {
				consumeToken(consumeSingleChar, 'delim-token');
			}
		} else {
			cache += stream.consume();
		}
	}
	consumePlainToken();
	return tokens;
};

// Demo: tokenize the sample stylesheet from the article and print the tokens
// as JSON indented with two spaces.
const sampleCss = `/* c */
#w{width:calc(+100%);content:'kfdl';}`;
const sampleTokens = parse(sampleCss);
console.log(JSON.stringify(sampleTokens, null, 2));