/*global Autolinker */
/**
* @private
* @class Autolinker.htmlParser.HtmlParser
* @extends Object
*
* An HTML parser implementation which simply walks an HTML string and returns an array of
* {@link Autolinker.htmlParser.HtmlNode HtmlNodes} that represent the basic HTML structure of the input string.
*
* Autolinker uses this to only link URLs/emails/Twitter handles within text nodes, effectively ignoring / "walking
* around" HTML tags.
*/
Autolinker.htmlParser.HtmlParser = Autolinker.Util.extend( Object, {

	/**
	 * @private
	 * @property {RegExp} htmlRegex
	 *
	 * The regular expression used to pull out HTML tags from a string. Handles namespaced HTML tags and
	 * attribute names, as specified by http://www.w3.org/TR/html-markup/syntax.html.
	 *
	 * Capturing groups:
	 *
	 * 1. The "!DOCTYPE" tag name, if a tag is a <!DOCTYPE> tag.
	 * 2. If it is an end tag, this group will have the '/'.
	 * 3. If it is a comment tag, this group will hold the comment text (i.e.
	 *    the text inside the `<!--` and `-->`).
	 * 4. The tag name for all tags (other than the <!DOCTYPE> tag)
	 *
	 * The regex is built once and carries the `g` flag, so it keeps its match
	 * position in `lastIndex` between `exec()` calls. {@link #parse} resets
	 * that position before each run.
	 */
	htmlRegex : (function() {
		var commentTagRegex = /!--([\s\S]+?)--/,
		    tagNameRegex = /[0-9a-zA-Z][0-9a-zA-Z:]*/,
		    attrNameRegex = /[^\s\0"'>\/=\x01-\x1F\x7F]+/,   // the unicode range accounts for excluding control chars, and the delete char
		    attrValueRegex = /(?:"[^"]*?"|'[^']*?'|[^'"=<>`\s]+)/, // double quoted, single quoted, or unquoted attribute values
		    nameEqualsValueRegex = attrNameRegex.source + '(?:\\s*=\\s*' + attrValueRegex.source + ')?';  // optional '=[value]'

		return new RegExp( [
			// for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
			'(?:',
				'<(!DOCTYPE)',  // *** Capturing Group 1 - If it's a doctype tag

				// Zero or more attributes following the tag name
				'(?:',
					'\\s+',  // one or more whitespace chars before an attribute

					// Either:
					// A. attr="value", or
					// B. "value" alone (To cover example doctype tag: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
					'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
				')*',
				'>',
			')',

			'|',

			// All other HTML tags (i.e. tags that are not <!DOCTYPE>)
			'(?:',
				'<(/)?',  // Beginning of a tag or comment. Either '<' for a start tag, or '</' for an end tag.
				          // *** Capturing Group 2: The slash or an empty string. Slash ('/') for end tag, empty string for start or self-closing tag.

				'(?:',
					commentTagRegex.source,  // *** Capturing Group 3 - A Comment Tag's Text

					'|',

					'(?:',
						// *** Capturing Group 4 - The tag name
						'(' + tagNameRegex.source + ')',

						// Zero or more attributes following the tag name
						'(?:',
							'\\s+',                // one or more whitespace chars before an attribute
							nameEqualsValueRegex,  // attr="value" (with optional ="value" part)
						')*',

						'\\s*/?',  // any trailing spaces and optional '/' before the closing '>'
					')',
				')',
				'>',
			')'
		].join( "" ), 'gi' );
	} )(),


	/**
	 * @private
	 * @property {RegExp} htmlCharacterEntitiesRegex
	 *
	 * The regular expression that matches common HTML character entities
	 * (named and numeric forms of the non-breaking space, '<', '>' and '"',
	 * plus the numeric form of the apostrophe).
	 *
	 * Ignoring &amp; as it could be part of a query string -- handling it separately.
	 */
	htmlCharacterEntitiesRegex: /(&nbsp;|&#160;|&lt;|&#60;|&gt;|&#62;|&quot;|&#34;|&#39;)/gi,


	/**
	 * Parses an HTML string and returns a simple array of {@link Autolinker.htmlParser.HtmlNode HtmlNodes}
	 * to represent the HTML structure of the input string.
	 *
	 * Text found between (and after) tags is further split into text nodes and
	 * entity nodes by {@link #parseTextAndEntityNodes}.
	 *
	 * @param {String} html The HTML to parse.
	 * @return {Autolinker.htmlParser.HtmlNode[]}
	 */
	parse : function( html ) {
		var htmlRegex = this.htmlRegex,
		    currentResult,
		    lastIndex = 0,
		    textAndEntityNodes,
		    nodes = [];  // will be the result of the method

		// `htmlRegex` has the `g` flag and is reused across calls, so `exec()`
		// resumes from its stored `lastIndex`. A completed loop leaves it at 0,
		// but a previous run that was interrupted by an exception would not;
		// always start matching from the beginning of the new input.
		htmlRegex.lastIndex = 0;

		while( ( currentResult = htmlRegex.exec( html ) ) !== null ) {
			var tagText = currentResult[ 0 ],
			    commentText = currentResult[ 3 ],  // if we've matched a comment
			    tagName = currentResult[ 1 ] || currentResult[ 4 ],  // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a" or "img")
			    isClosingTag = !!currentResult[ 2 ],
			    inBetweenTagsText = html.substring( lastIndex, currentResult.index );

			// Push TextNodes and EntityNodes for any text found between tags
			if( inBetweenTagsText ) {
				textAndEntityNodes = this.parseTextAndEntityNodes( inBetweenTagsText );
				nodes.push.apply( nodes, textAndEntityNodes );
			}

			// Push the CommentNode or ElementNode
			if( commentText ) {
				nodes.push( this.createCommentNode( tagText, commentText ) );
			} else {
				nodes.push( this.createElementNode( tagText, tagName, isClosingTag ) );
			}

			lastIndex = currentResult.index + tagText.length;
		}

		// Process any remaining text after the last HTML element. Will process all of the text if there were no HTML elements.
		// (The remaining substring is necessarily non-empty when `lastIndex` is short of the end.)
		if( lastIndex < html.length ) {
			textAndEntityNodes = this.parseTextAndEntityNodes( html.substring( lastIndex ) );
			nodes.push.apply( nodes, textAndEntityNodes );
		}

		return nodes;
	},


	/**
	 * Parses text and HTML entity nodes from a given string. The input string
	 * should not have any HTML tags (elements) within it.
	 *
	 * @private
	 * @param {String} text The text to parse.
	 * @return {Autolinker.htmlParser.HtmlNode[]} An array of HtmlNodes to
	 *   represent the {@link Autolinker.htmlParser.TextNode TextNodes} and
	 *   {@link Autolinker.htmlParser.EntityNode EntityNodes} found.
	 */
	parseTextAndEntityNodes : function( text ) {
		var nodes = [],
		    textAndEntityTokens = Autolinker.Util.splitAndCapture( text, this.htmlCharacterEntitiesRegex );  // split at HTML entities, but include the HTML entities in the results array

		// Every even numbered token is a TextNode, and every odd numbered token is an EntityNode
		// For example: an input `text` of "Test &quot;this&quot; today" would turn into the
		//   `textAndEntityTokens`: [ 'Test ', '&quot;', 'this', '&quot;', ' today' ]
		for( var i = 0, len = textAndEntityTokens.length; i < len; i += 2 ) {
			var textToken = textAndEntityTokens[ i ],
			    entityToken = textAndEntityTokens[ i + 1 ];

			// Empty or missing tokens (e.g. two adjacent entities, or no
			// trailing entity) produce no node.
			if( textToken ) nodes.push( this.createTextNode( textToken ) );
			if( entityToken ) nodes.push( this.createEntityNode( entityToken ) );
		}
		return nodes;
	},


	/**
	 * Factory method to create an {@link Autolinker.htmlParser.CommentNode CommentNode}.
	 *
	 * @private
	 * @param {String} tagText The full text of the tag (comment) that was
	 *   matched, including its <!-- and -->.
	 * @param {String} commentText The text of the comment that was matched
	 *   (the part between the <!-- and -->). It is trimmed with
	 *   `Autolinker.Util.trim` before being stored on the node.
	 * @return {Autolinker.htmlParser.CommentNode}
	 */
	createCommentNode : function( tagText, commentText ) {
		return new Autolinker.htmlParser.CommentNode( {
			text: tagText,
			comment: Autolinker.Util.trim( commentText )
		} );
	},


	/**
	 * Factory method to create an {@link Autolinker.htmlParser.ElementNode ElementNode}.
	 *
	 * @private
	 * @param {String} tagText The full text of the tag (element) that was
	 *   matched, including its attributes.
	 * @param {String} tagName The name of the tag. Ex: An <img> tag would
	 *   be passed to this method as "img". It is lower-cased before being
	 *   stored on the node.
	 * @param {Boolean} isClosingTag `true` if it's a closing tag, false
	 *   otherwise.
	 * @return {Autolinker.htmlParser.ElementNode}
	 */
	createElementNode : function( tagText, tagName, isClosingTag ) {
		return new Autolinker.htmlParser.ElementNode( {
			text    : tagText,
			tagName : tagName.toLowerCase(),
			closing : isClosingTag
		} );
	},


	/**
	 * Factory method to create a {@link Autolinker.htmlParser.EntityNode EntityNode}.
	 *
	 * @private
	 * @param {String} text The text that was matched for the HTML entity (such
	 *   as '&nbsp;').
	 * @return {Autolinker.htmlParser.EntityNode}
	 */
	createEntityNode : function( text ) {
		return new Autolinker.htmlParser.EntityNode( { text: text } );
	},


	/**
	 * Factory method to create a {@link Autolinker.htmlParser.TextNode TextNode}.
	 *
	 * @private
	 * @param {String} text The text that was matched.
	 * @return {Autolinker.htmlParser.TextNode}
	 */
	createTextNode : function( text ) {
		return new Autolinker.htmlParser.TextNode( { text: text } );
	}

} );