/*global Autolinker */
/**
* @private
* @class Autolinker.matchParser.MatchParser
* @extends Object
*
* Used by Autolinker to parse potential matches, given an input string of text.
*
* The MatchParser is fed a non-HTML string in order to search for matches.
* Autolinker first uses the {@link Autolinker.htmlParser.HtmlParser} to "walk
* around" HTML tags, and then the text around the HTML tags is passed into the
* MatchParser in order to find the actual matches.
*/
Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
/**
* @cfg {Boolean} urls
* @inheritdoc Autolinker#urls
*/
urls : true,
/**
* @cfg {Boolean} email
* @inheritdoc Autolinker#email
*/
email : true,
* @cfg {Boolean} twitter
* @inheritdoc Autolinker#twitter
*/
twitter : true,
/**
* @cfg {Boolean} phone
* @inheritdoc Autolinker#phone
*/
phone: true,
/**
* @cfg {Boolean/String} hashtag
* @inheritdoc Autolinker#hashtag
*/
hashtag : false,
/**
* @cfg {Boolean} stripPrefix
* @inheritdoc Autolinker#stripPrefix
*/
stripPrefix : true,
/**
* @private
* @property {RegExp} matcherRegex
*
* The regular expression that matches URLs, email addresses, phone #s,
* Twitter handles, and Hashtags. It is built once, by the immediately-invoked
* function below, from the `source` of several smaller regular expressions,
* and is compiled with the 'g' (global) and 'i' (case-insensitive) flags.
*
* This regular expression has the following capturing groups:
*
* 1. Group that is used to determine if there is a Twitter handle match
* (i.e. \@someTwitterUser). Simply check for its existence to determine
* if there is a Twitter handle match. The next couple of capturing
* groups give information about the Twitter handle match.
* 2. The character before the \@sign in a Twitter handle. The pattern
* accepts any non-word character here (or nothing, at the start of the
* input). This is needed because there are no lookbehinds in JS regular
* expressions, and can be used to reconstruct the original string in a
* replace().
* 3. The Twitter handle itself in a Twitter match. If the match is
* '@someTwitterUser', the handle is 'someTwitterUser'.
* 4. Group that matches an email address. Used to determine if the match
* is an email address, as well as holding the full address. Ex:
* 'me@my.com'
* 5. Group that matches a URL in the input text. Ex: 'http://google.com',
* 'www.google.com', or just 'google.com'. This also includes a path,
* url parameters, or hash anchors. Ex: google.com/path/to/file?q1=1&q2=2#myAnchor
* 6. Group that matches a protocol URL (i.e. 'http://google.com'). This is
* used to match protocol URLs with just a single word, like 'http://localhost',
* where we won't double check that the domain name has at least one '.'
* in it.
* 7. A protocol-relative ('//') match for the case of a 'www.' prefixed
* URL. Will be empty (falsy) if it is not a protocol-relative match.
* We need to know the character before the '//' in order to determine
* if it is a valid match or the // was in a string we don't want to
* auto-link.
* 8. A protocol-relative ('//') match for the case of a known TLD prefixed
* URL. Will be empty (falsy) if it is not a protocol-relative match.
* See #7 for more info.
* 9. Group that is used to determine if there is a phone number match. It
* holds the whole phone number; the phone pattern itself contains only
* non-capturing groups, so no further groups describe the number.
* 10. Group that is used to determine if there is a Hashtag match
* (i.e. \#someHashtag). Simply check for its existence to determine if
* there is a Hashtag match. The next couple of capturing groups give
* information about the Hashtag match.
* 11. The character before the #sign in a Hashtag. As with #2, the pattern
* accepts any non-word character (or nothing, at the start of the
* input). This is needed because there are no look-behinds in JS regular
* expressions, and can be used to reconstruct the original string in a
* replace().
* 12. The Hashtag itself in a Hashtag match. If the match is
* '#someHashtag', the hashtag is 'someHashtag'.
*/
matcherRegex : (function() {
// Building blocks. Only their `.source` text is used; each one's own flags are irrelevant.
var twitterRegex = /(^|[^\w])@(\w{1,15})/, // For matching a twitter handle. Ex: @gregory_jacobs (handle is 1 to 15 word characters)
hashtagRegex = /(^|[^\w])#(\w{1,139})/, // For matching a Hashtag. Ex: #games (tag is 1 to 139 word characters)
emailRegex = /(?:[\-;:&=\+\$,\w\.]+@)/, // something@ for email addresses (a.k.a. local-part)
phoneRegex = /(?:\+?\d{1,3}[-\040.])?\(?\d{3}\)?[-\040.]?\d{3}[-\040.]\d{4}/, // ex: (123) 456-7890, 123 456 7890, 123-456-7890, etc. (\040 is a space; no capturing groups in here)
protocolRegex = /(?:[A-Za-z][-.+A-Za-z0-9]*:(?![A-Za-z][-.+A-Za-z0-9]*:\/\/)(?!\d+\/?)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
wwwRegex = /(?:www\.)/, // starting with 'www.'
domainNameRegex = /[A-Za-z0-9\.\-]*[A-Za-z0-9\-]/, // anything looking at all like a domain, non-unicode domains, not ending in a period
tldRegex = /\.(?:international|construction|contractors|enterprises|photography|productions|foundation|immobilien|industries|management|properties|technology|christmas|community|directory|education|equipment|institute|marketing|solutions|vacations|bargains|boutique|builders|catering|cleaning|clothing|computer|democrat|diamonds|graphics|holdings|lighting|partners|plumbing|supplies|training|ventures|academy|careers|company|cruises|domains|exposed|flights|florist|gallery|guitars|holiday|kitchen|neustar|okinawa|recipes|rentals|reviews|shiksha|singles|support|systems|agency|berlin|camera|center|coffee|condos|dating|estate|events|expert|futbol|kaufen|luxury|maison|monash|museum|nagoya|photos|repair|report|social|supply|tattoo|tienda|travel|viajes|villas|vision|voting|voyage|actor|build|cards|cheap|codes|dance|email|glass|house|mango|ninja|parts|photo|press|shoes|solar|today|tokyo|tools|watch|works|aero|arpa|asia|best|bike|blue|buzz|camp|club|cool|coop|farm|fish|gift|guru|info|jobs|kiwi|kred|land|limo|link|menu|mobi|moda|name|pics|pink|post|qpon|rich|ruhr|sexy|tips|vote|voto|wang|wien|wiki|zone|bar|bid|biz|cab|cat|ceo|com|edu|gov|int|kim|mil|net|onl|org|pro|pub|red|tel|uno|wed|xxx|xyz|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)\b/, // match our known top level domains (TLDs)
// Allow optional path, query string, and hash anchor, not ending in the following characters: "?!:,.;"
// http://blog.codinghorror.com/the-problem-with-urls/
urlSuffixRegex = /[\-A-Za-z0-9+&@#\/%=~_()|'$*\[\]?!:,.;]*[\-A-Za-z0-9+&@#\/%=~_()|'$*\[\]]/;
// The pieces are concatenated in order, so the order of the alternatives below fixes the capturing group numbers.
return new RegExp( [
'(', // *** Capturing group $1, which can be used to check for a twitter handle match. Use group $3 for the actual twitter handle though. $2 may be used to reconstruct the original string in a replace()
// *** Capturing group $2, which matches the non-word character (if any) before the '@' sign (needed because of no lookbehinds), and
// *** Capturing group $3, which matches the actual twitter handle
twitterRegex.source,
')',
'|',
'(', // *** Capturing group $4, which is used to determine an email match
emailRegex.source,
domainNameRegex.source,
tldRegex.source,
')',
'|',
'(', // *** Capturing group $5, which is used to match a URL
'(?:', // parens to cover match for protocol (optional), and domain
'(', // *** Capturing group $6, for a protocol-prefixed url (ex: http://google.com)
protocolRegex.source,
domainNameRegex.source,
')',
'|',
'(?:', // non-capturing paren for a 'www.' prefixed url (ex: www.google.com)
'(.?//)?', // *** Capturing group $7 for an optional protocol-relative URL. Also captures the single character (if any) in front of the '//', so that it can be inspected and stripped from the match later
wwwRegex.source,
domainNameRegex.source,
')',
'|',
'(?:', // non-capturing paren for a known TLD url (ex: google.com)
'(.?//)?', // *** Capturing group $8 for an optional protocol-relative URL. Also captures the single character (if any) in front of the '//', so that it can be inspected and stripped from the match later
domainNameRegex.source,
tldRegex.source,
')',
')',
'(?:' + urlSuffixRegex.source + ')?', // match for path, query string, and/or hash anchor - optional
')',
'|',
// this setup does not scale well for open extension :( Need to rethink design of autolinker...
// *** Capturing group $9, which matches a (USA for now) phone number. It holds the full number; phoneRegex adds no capturing groups of its own
'(',
phoneRegex.source,
')',
'|',
'(', // *** Capturing group $10, which can be used to check for a Hashtag match. Use group $12 for the actual Hashtag though. $11 may be used to reconstruct the original string in a replace()
// *** Capturing group $11, which matches the non-word character (if any) before the '#' sign (needed because of no lookbehinds), and
// *** Capturing group $12, which matches the actual Hashtag
hashtagRegex.source,
')'
].join( "" ), 'gi' );
} )(),
/**
* @private
* @property {RegExp} charBeforeProtocolRelMatchRegex
*
* The regular expression used to retrieve the character before a
* protocol-relative URL match.
*
* This is used in conjunction with the {@link #matcherRegex}, which needs
* to grab the character before a protocol-relative '//' due to the lack of
* a negative look-behind in JavaScript regular expressions. The character
* before the match is stripped from the URL.
*
* Capturing group 1 is the single, optional character found at the start of
* the tested string, directly in front of the '//'. It is unset when the
* string starts with '//' itself.
*/
charBeforeProtocolRelMatchRegex : /^(.)?\/\//,
/**
* @private
* @property {Autolinker.MatchValidator} matchValidator
*
* The MatchValidator object, used to filter out any false positives from
* the {@link #matcherRegex}. See {@link Autolinker.MatchValidator} for details.
*/
/**
* @constructor
* @param {Object} [cfg] The configuration options for the AnchorTagBuilder
* instance, specified in an Object (map).
*/
constructor : function( cfg ) {
Autolinker.Util.assign( this, cfg );
this.matchValidator = new Autolinker.MatchValidator();
},
/**
* Parses the input `text` to search for matches, and calls the `replaceFn`
* to allow replacements of the matches. Returns the `text` with matches
* replaced.
*
* @param {String} text The text to search and repace matches in.
* @param {Function} replaceFn The iterator function to handle the
* replacements. The function takes a single argument, a {@link Autolinker.match.Match}
* object, and should return the text that should make the replacement.
* @param {Object} [contextObj=window] The context object ("scope") to run
* the `replaceFn` in.
* @return {String}
*/
replace : function( text, replaceFn, contextObj ) {
var me = this; // for closure
return text.replace( this.matcherRegex, function( matchStr, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12 ) {
var matchDescObj = me.processCandidateMatch( matchStr, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12 ); // "match description" object
// Return out with no changes for match types that are disabled (url,
// email, phone, etc.), or for matches that are invalid (false
// positives from the matcherRegex, which can't use look-behinds
// since they are unavailable in JS).
if( !matchDescObj ) {
return matchStr;
} else {
// Generate replacement text for the match from the `replaceFn`
var replaceStr = replaceFn.call( contextObj, matchDescObj.match );
return matchDescObj.prefixStr + replaceStr + matchDescObj.suffixStr;
}
} );
},
/**
* Processes a candidate match from the {@link #matcherRegex}.
*
* Not all matches found by the regex are actual URL/Email/Phone/Twitter/Hashtag
* matches, as determined by the {@link #matchValidator}. In this case, the
* method returns `null`. Otherwise, a valid Object with `prefixStr`,
* `match`, and `suffixStr` is returned.
*
* @private
* @param {String} matchStr The full match that was found by the
* {@link #matcherRegex}.
* @param {String} twitterMatch The matched text of a Twitter handle, if the
* match is a Twitter match.
* @param {String} twitterHandlePrefixWhitespaceChar The whitespace char
* before the @ sign in a Twitter handle match. This is needed because of
* no lookbehinds in JS regexes, and is need to re-include the character
* for the anchor tag replacement.
* @param {String} twitterHandle The actual Twitter user (i.e the word after
* the @ sign in a Twitter match).
* @param {String} emailAddressMatch The matched email address for an email
* address match.
* @param {String} urlMatch The matched URL string for a URL match.
* @param {String} protocolUrlMatch The match URL string for a protocol
* match. Ex: 'http://yahoo.com'. This is used to match something like
* 'http://localhost', where we won't double check that the domain name
* has at least one '.' in it.
* @param {String} wwwProtocolRelativeMatch The '//' for a protocol-relative
* match from a 'www' url, with the character that comes before the '//'.
* @param {String} tldProtocolRelativeMatch The '//' for a protocol-relative
* match from a TLD (top level domain) match, with the character that
* comes before the '//'.
* @param {String} phoneMatch The matched text of a phone number
* @param {String} hashtagMatch The matched text of a Twitter
* Hashtag, if the match is a Hashtag match.
* @param {String} hashtagPrefixWhitespaceChar The whitespace char
* before the # sign in a Hashtag match. This is needed because of no
* lookbehinds in JS regexes, and is need to re-include the character for
* the anchor tag replacement.
* @param {String} hashtag The actual Hashtag (i.e the word
* after the # sign in a Hashtag match).
*
* @return {Object} A "match description object". This will be `null` if the
* match was invalid, or if a match type is disabled. Otherwise, this will
* be an Object (map) with the following properties:
* @return {String} return.prefixStr The char(s) that should be prepended to
* the replacement string. These are char(s) that were needed to be
* included from the regex match that were ignored by processing code, and
* should be re-inserted into the replacement stream.
* @return {String} return.suffixStr The char(s) that should be appended to
* the replacement string. These are char(s) that were needed to be
* included from the regex match that were ignored by processing code, and
* should be re-inserted into the replacement stream.
* @return {Autolinker.match.Match} return.match The Match object that
* represents the match that was found.
*/
processCandidateMatch : function(
matchStr, twitterMatch, twitterHandlePrefixWhitespaceChar, twitterHandle,
emailAddressMatch, urlMatch, protocolUrlMatch, wwwProtocolRelativeMatch,
tldProtocolRelativeMatch, phoneMatch, hashtagMatch,
hashtagPrefixWhitespaceChar, hashtag
) {
// Note: The `matchStr` variable wil be fixed up to remove characters that are no longer needed (which will
// be added to `prefixStr` and `suffixStr`).
var protocolRelativeMatch = wwwProtocolRelativeMatch || tldProtocolRelativeMatch,
match, // Will be an Autolinker.match.Match object
prefixStr = "", // A string to use to prefix the anchor tag that is created. This is needed for the Twitter and Hashtag matches.
suffixStr = ""; // A string to suffix the anchor tag that is created. This is used if there is a trailing parenthesis that should not be auto-linked.
// Return out with `null` for match types that are disabled (url, email,
// twitter, hashtag), or for matches that are invalid (false positives
// from the matcherRegex, which can't use look-behinds since they are
// unavailable in JS).
if(
( urlMatch && !this.urls ) ||
( emailAddressMatch && !this.email ) ||
( phoneMatch && !this.phone ) ||
( twitterMatch && !this.twitter ) ||
( hashtagMatch && !this.hashtag ) ||
!this.matchValidator.isValidMatch( urlMatch, protocolUrlMatch, protocolRelativeMatch )
) {
return null;
}
// Handle a closing parenthesis at the end of the match, and exclude it
// if there is not a matching open parenthesis
// in the match itself.
if( this.matchHasUnbalancedClosingParen( matchStr ) ) {
matchStr = matchStr.substr( 0, matchStr.length - 1 ); // remove the trailing ")"
suffixStr = ")"; // this will be added after the generated <a> tag
} else {
// Handle an invalid character after the TLD
var pos = this.matchHasInvalidCharAfterTld( urlMatch, protocolUrlMatch );
if( pos > -1 ) {
suffixStr = matchStr.substr(pos); // this will be added after the generated <a> tag
matchStr = matchStr.substr( 0, pos ); // remove the trailing invalid chars
}
}
if( emailAddressMatch ) {
match = new Autolinker.match.Email( { matchedText: matchStr, email: emailAddressMatch } );
} else if( twitterMatch ) {
// fix up the `matchStr` if there was a preceding whitespace char,
// which was needed to determine the match itself (since there are
// no look-behinds in JS regexes)
if( twitterHandlePrefixWhitespaceChar ) {
prefixStr = twitterHandlePrefixWhitespaceChar;
matchStr = matchStr.slice( 1 ); // remove the prefixed whitespace char from the match
}
match = new Autolinker.match.Twitter( { matchedText: matchStr, twitterHandle: twitterHandle } );
} else if( phoneMatch ) {
// remove non-numeric values from phone number string
var cleanNumber = matchStr.replace( /\D/g, '' );
match = new Autolinker.match.Phone( { matchedText: matchStr, number: cleanNumber } );
} else if( hashtagMatch ) {
// fix up the `matchStr` if there was a preceding whitespace char,
// which was needed to determine the match itself (since there are
// no look-behinds in JS regexes)
if( hashtagPrefixWhitespaceChar ) {
prefixStr = hashtagPrefixWhitespaceChar;
matchStr = matchStr.slice( 1 ); // remove the prefixed whitespace char from the match
}
match = new Autolinker.match.Hashtag( { matchedText: matchStr, serviceName: this.hashtag, hashtag: hashtag } );
} else { // url match
// If it's a protocol-relative '//' match, remove the character
// before the '//' (which the matcherRegex needed to match due to
// the lack of a negative look-behind in JavaScript regular
// expressions)
if( protocolRelativeMatch ) {
var charBeforeMatch = protocolRelativeMatch.match( this.charBeforeProtocolRelMatchRegex )[ 1 ] || "";
if( charBeforeMatch ) { // fix up the `matchStr` if there was a preceding char before a protocol-relative match, which was needed to determine the match itself (since there are no look-behinds in JS regexes)
prefixStr = charBeforeMatch;
matchStr = matchStr.slice( 1 ); // remove the prefixed char from the match
}
}
match = new Autolinker.match.Url( {
matchedText : matchStr,
url : matchStr,
protocolUrlMatch : !!protocolUrlMatch,
protocolRelativeMatch : !!protocolRelativeMatch,
stripPrefix : this.stripPrefix
} );
}
return {
prefixStr : prefixStr,
suffixStr : suffixStr,
match : match
};
},
/**
* Determines if a match found has an unmatched closing parenthesis. If so,
* this parenthesis will be removed from the match itself, and appended
* after the generated anchor tag in {@link #processCandidateMatch}.
*
* A match may have an extra closing parenthesis at the end of the match
* because the regular expression must include parenthesis for URLs such as
* "wikipedia.com/something_(disambiguation)", which should be auto-linked.
*
* However, an extra parenthesis *will* be included when the URL itself is
* wrapped in parenthesis, such as in the case of "(wikipedia.com/something_(disambiguation))".
* In this case, the last closing parenthesis should *not* be part of the
* URL itself, and this method will return `true`.
*
* @private
* @param {String} matchStr The full match string from the {@link #matcherRegex}.
* @return {Boolean} `true` if there is an unbalanced closing parenthesis at
* the end of the `matchStr`, `false` otherwise.
*/
matchHasUnbalancedClosingParen : function( matchStr ) {
var lastChar = matchStr.charAt( matchStr.length - 1 );
if( lastChar === ')' ) {
var openParensMatch = matchStr.match( /\(/g ),
closeParensMatch = matchStr.match( /\)/g ),
numOpenParens = ( openParensMatch && openParensMatch.length ) || 0,
numCloseParens = ( closeParensMatch && closeParensMatch.length ) || 0;
if( numOpenParens < numCloseParens ) {
return true;
}
}
return false;
},
/**
* Determine if there's an invalid character after the TLD in a URL. Valid
* characters after TLD are ':/?#'. Exclude protocol matched URLs from this
* check.
*
* @private
* @param {String} urlMatch The matched URL, if there was one. Will be an
* empty string if the match is not a URL match.
* @param {String} protocolUrlMatch The match URL string for a protocol
* match. Ex: 'http://yahoo.com'. This is used to match something like
* 'http://localhost', where we won't double check that the domain name
* has at least one '.' in it.
* @return {Number} the position where the invalid character was found. If
* no such character was found, returns -1
*/
matchHasInvalidCharAfterTld : function( urlMatch, protocolUrlMatch ) {
if ( !urlMatch ) {
return -1;
}
var offset = 0;
if ( protocolUrlMatch ) {
offset = urlMatch.indexOf(':');
urlMatch = urlMatch.slice(offset);
}
var re = /^((.?\/\/)?[A-Za-z0-9\.\-]*[A-Za-z0-9\-]\.[A-Za-z]+)/;
var res = re.exec( urlMatch );
if ( res === null ) {
return -1;
}
offset += res[1].length;
urlMatch = urlMatch.slice(res[1].length);
if (/^[^.A-Za-z:\/?#]/.test(urlMatch)) {
return offset;
}
return -1;
}
} );