From ded69477ba4dcfeba83d7aaad6048db64fb154d6 Mon Sep 17 00:00:00 2001 From: Florian Staudacher Date: Wed, 10 Oct 2012 00:07:59 +0200 Subject: [PATCH] replace the monstrous link regex with a slightly less complicated one, based on the one from the markdown parser, fixes #3553 adding a list of urls for testing the regex in the specs --- .../javascripts/app/helpers/text_formatter.js | 31 ++++++++---- spec/fixtures/bad_urls.txt | 43 +++++++++++++++++ spec/fixtures/good_urls.txt | 40 ++++++++++++++++ .../app/helpers/text_formatter_spec.js | 48 +++++++++++++++++-- 4 files changed, 151 insertions(+), 11 deletions(-) create mode 100644 spec/fixtures/bad_urls.txt create mode 100644 spec/fixtures/good_urls.txt diff --git a/app/assets/javascripts/app/helpers/text_formatter.js b/app/assets/javascripts/app/helpers/text_formatter.js index b33fb3b53..a4602b12f 100644 --- a/app/assets/javascripts/app/helpers/text_formatter.js +++ b/app/assets/javascripts/app/helpers/text_formatter.js @@ -1,3 +1,9 @@ + +// cache url regex globally, for direct acces when testing +$(function() { + Diaspora.url_regex = /(^|\s)\b((?:(?:https?|ftp):(?:\/{1,3})|www\.)(?:[^"<>\)\s]|\(([^\s()<>]+|(\([^\s()<>]+\)))\))+)(?=\s|$)/gi; +}); + (function(){ //make it so I take text and mentions rather than the modelapp.helpers.textFormatter( var textFormatter = function textFormatter(text, model) { @@ -17,20 +23,29 @@ converter.hooks.chain("preConversion", function(text) { // add < > around plain urls, effectively making them "autolinks" - // regex copied from: http://daringfireball.net/2010/07/improved_regex_for_matching_urls (slightly modified) - var urlRegex = /(^|\s)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/gi; - text = text.replace(urlRegex, function(wholematch, space, url) { - if( url.match(/^[^\w]/) ) return wholematch; // evil witchcraft, noop - return space+"<"+url+">"; + text = text.replace(Diaspora.url_regex, function() { + var url = arguments[2]; + if( url.match(/^[^\w]/) ) return url; // evil witchcraft, noop + return arguments[1]+"<"+url+">"; }); // process links // regex copied from: https://code.google.com/p/pagedown/source/browse/Markdown.Converter.js#1198 (and slightly expanded) - var linkRegex = /(\[.*\]:\s)?(<|\()((?:(https?|ftp):\/\/[^\/'">\s]|www)[^'">\s]+?)(>|\))/gi; + var linkRegex = /(\[.*\]:\s)?(<|\()((?:(https?|ftp):\/\/[^\/'">\s]|www)[^'">\s]+?)([>\)]{1,2})/gi; text = text.replace(linkRegex, function() { var unicodeUrl = arguments[3]; + var urlSuffix = arguments[5]; + unicodeUrl = ( unicodeUrl.match(/^www/) ) ? ('http://' + unicodeUrl) : unicodeUrl; + // handle parentheses, especially in case the link ends with ')' + if( urlSuffix.indexOf(')') != -1 && urlSuffix.indexOf('>') != -1 ) { + unicodeUrl += ')'; + urlSuffix = '>'; + } + // markdown doesn't like '(' or ')' anywhere, except where it wants + var workingUrl = unicodeUrl.replace(/\(/, "%28").replace(/\)/, "%29"); + var addr = parse_url(unicodeUrl); if( !addr.host ) addr.host = ""; // must not be 'undefined' @@ -45,8 +60,8 @@ (!addr.query ? '' : '?' + encodeURI(addr.query) ) + (!addr.fragment ? '' : '#' + encodeURI(addr.fragment) ); if( !arguments[1] || arguments[1] == "") { // inline link - if(arguments[2] == "<") return "["+unicodeUrl+"]("+asciiUrl+")"; // without link text - else return arguments[2]+asciiUrl+arguments[5]; // with link text + if(arguments[2] == "<") return "["+workingUrl+"]("+asciiUrl+")"; // without link text + else return arguments[2]+asciiUrl+urlSuffix; // with link text } else { // reference style link return arguments[1]+asciiUrl; } diff --git a/spec/fixtures/bad_urls.txt b/spec/fixtures/bad_urls.txt new file mode 100644 index 000000000..61ccd7b96 --- /dev/null +++ b/spec/fixtures/bad_urls.txt @@ -0,0 +1,43 @@ +# +# http://mathiasbynens.be/demo/url-regex +# lines starting with '#' are ignored +# +http:// +http://. +http://.. +http://../ +http://? +http://?? +http://??/ +http://# +http://## +http://##/ +http://foo.bar?q=Spaces should be encoded +// +//a +///a +/// +http:///a +foo.com +rdar://1234 +h://test +http:// shouldfail.com +:// should fail +http://foo.bar/foo(bar)baz quux +ftps://foo.bar/ +http://-error-.invalid/ +http://a.b--c.de/ +http://-a.b.co +http://a.b-.co +http://0.0.0.0 +http://10.1.1.0 +http://10.1.1.255 +http://224.1.1.1 +http://1.1.1.1.1 +http://123.123.123 +http://3628126748 +http://.www.foo.bar/ +http://www.foo.bar./ +http://.www.foo.bar./ +http://10.1.1.1 +http://10.1.1.254 \ No newline at end of file diff --git a/spec/fixtures/good_urls.txt b/spec/fixtures/good_urls.txt new file mode 100644 index 000000000..2ef9ff98b --- /dev/null +++ b/spec/fixtures/good_urls.txt @@ -0,0 +1,40 @@ +# +# http://mathiasbynens.be/demo/url-regex +# lines starting with '#' are ignored +# +http://foo.com/blah_blah +http://foo.com/blah_blah/ +http://foo.com/blah_blah_(wikipedia) +http://foo.com/blah_blah_(wikipedia)_(again) +http://www.example.com/wpstyle/?p=364 +https://www.example.com/foo/?bar=baz&inga=42&quux +http://✪df.ws/123 +http://userid:password@example.com:8080 +http://userid:password@example.com:8080/ +http://userid@example.com +http://userid@example.com/ +http://userid@example.com:8080 +http://userid@example.com:8080/ +http://userid:password@example.com +http://userid:password@example.com/ +http://142.42.1.1/ +http://142.42.1.1:8080/ +http://➡.ws/䨹 +http://⌘.ws +http://⌘.ws/ +http://foo.com/blah_(wikipedia)#cite-1 +http://foo.com/blah_(wikipedia)_blah#cite-1 +http://foo.com/unicode_(✪)_in_parens +http://foo.com/(something)?after=parens +http://☺.damowmow.com/ +http://code.google.com/events/#&product=browser +http://j.mp +ftp://foo.bar/baz +http://foo.bar/?q=Test%20URL-encoded%20stuff +http://مثال.إختبار +http://例子.测试 +http://उदाहरण.परीक्षा +# http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com +http://1337.net +http://a.b-c.de +http://223.255.255.254 \ No newline at end of file diff --git a/spec/javascripts/app/helpers/text_formatter_spec.js b/spec/javascripts/app/helpers/text_formatter_spec.js index d175e833e..51acb2816 100644 --- a/spec/javascripts/app/helpers/text_formatter_spec.js +++ b/spec/javascripts/app/helpers/text_formatter_spec.js @@ -41,7 +41,7 @@ describe("app.helpers.textFormatter", function(){ // // var formattedText = this.formatter.markdownify(links.join(" ")) - var formattedText = this.formatter.markdownify(links.join(" and ")) + var formattedText = this.formatter.markdownify(links.join(" and ")); var wrapper = $("
").html(formattedText); _.each(links, function(link) { @@ -87,14 +87,16 @@ describe("app.helpers.textFormatter", function(){ "http://bündnis-für-krankenhäuser.de/wp-content/uploads/2011/11/cropped-logohp.jpg", "http://موقع.وزارة-الاتصالات.مصر/", // example from #3082 "http:///scholar.google.com/citations?view_op=top_venues", - "http://lyricstranslate.com/en/someone-you-നിന്നെ-പോലൊരാള്‍.html" // example from #3063 + "http://lyricstranslate.com/en/someone-you-നിന്നെ-പോലൊരാള്‍.html", // example from #3063, + "http://de.wikipedia.org/wiki/Liste_der_Abkürzungen_(Netzjargon)" // #3645 ]; this.asciiUrls = [ "http://www.xn--brgerentscheid-krankenhuser-xkc78d.de", "http://xn--bndnis-fr-krankenhuser-i5b27cha.de/wp-content/uploads/2011/11/cropped-logohp.jpg", "http://xn--4gbrim.xn----ymcbaaajlc6dj7bxne2c.xn--wgbh1c/", "http:///scholar.google.com/citations?view_op=top_venues", - "http://lyricstranslate.com/en/someone-you-%E0%B4%A8%E0%B4%BF%E0%B4%A8%E0%B5%8D%E0%B4%A8%E0%B5%86-%E0%B4%AA%E0%B5%8B%E0%B4%B2%E0%B5%8A%E0%B4%B0%E0%B4%BE%E0%B4%B3%E0%B5%8D%E2%80%8D.html" + "http://lyricstranslate.com/en/someone-you-%E0%B4%A8%E0%B4%BF%E0%B4%A8%E0%B5%8D%E0%B4%A8%E0%B5%86-%E0%B4%AA%E0%B5%8B%E0%B4%B2%E0%B5%8A%E0%B4%B0%E0%B4%BE%E0%B4%B3%E0%B5%8D%E2%80%8D.html", + "http://de.wikipedia.org/wiki/Liste_der_Abk%C3%BCrzungen_%28Netzjargon%29" ]; }); @@ -141,6 +143,46 @@ describe("app.helpers.textFormatter", function(){ }); + context("misc breakage and/or other issues with weird urls", function(){ + it("doesn't crash Chromium - RUN ME WITH CHROMIUM! (issue #3553)", function() { + + var text_part = 'Revert "rails admin is conflicting with client side validations: see https://github.com/sferik/rails_admin/issues/985"'; + var link_part = 'https://github.com/diaspora/diaspora/commit/61f40fc6bfe6bb859c995023b5a17d22c9b5e6e5'; + var content = '['+text_part+']('+link_part+')'; + var parsed = this.formatter.markdownify(content); + + var link = 'href="' + link_part + '"'; + var text = '>'+ text_part +'<'; + + expect(parsed).toContain(link); + expect(parsed).toContain(text); + }); + + it("tests a bunch of benchmark urls", function(){ + var self = this; + $.ajax({ + async: false, + cache: false, + url: '/spec/fixtures/good_urls.txt', + success: function(data) { self.url_list = data.split("\n"); } + }); + + _.each(this.url_list, function(url) { + // 'comments' + if( url.match(/^#/) ) return; + + // regex.test is stupid, use match and boolean-ify it + var result = !!url.match(Diaspora.url_regex); + expect(result).toBeTruthy(); + if( !result && console && console.log ) { + console.log(url); + } + }); + }); + + // TODO: try to match the 'bad_urls.txt' and have as few matches as possible + }); + }) describe(".hashtagify", function(){