From ded69477ba4dcfeba83d7aaad6048db64fb154d6 Mon Sep 17 00:00:00 2001
From: Florian Staudacher <florian_staudacher@yahoo.de>
Date: Wed, 10 Oct 2012 00:07:59 +0200
Subject: [PATCH] replace the monstrous link regex with a slightly less
 complicated one, based on the one from the markdown parser, fixes #3553
 adding a list of urls for testing the regex in the specs

---
 .../javascripts/app/helpers/text_formatter.js | 31 ++++++++----
 spec/fixtures/bad_urls.txt                    | 43 +++++++++++++++++
 spec/fixtures/good_urls.txt                   | 40 ++++++++++++++++
 .../app/helpers/text_formatter_spec.js        | 48 +++++++++++++++++--
 4 files changed, 151 insertions(+), 11 deletions(-)
 create mode 100644 spec/fixtures/bad_urls.txt
 create mode 100644 spec/fixtures/good_urls.txt

diff --git a/app/assets/javascripts/app/helpers/text_formatter.js b/app/assets/javascripts/app/helpers/text_formatter.js
index b33fb3b53..a4602b12f 100644
--- a/app/assets/javascripts/app/helpers/text_formatter.js
+++ b/app/assets/javascripts/app/helpers/text_formatter.js
@@ -1,3 +1,9 @@
+
+// cache url regex globally, for direct acces when testing
+$(function() {
+  Diaspora.url_regex = /(^|\s)\b((?:(?:https?|ftp):(?:\/{1,3})|www\.)(?:[^"<>\)\s]|\(([^\s()<>]+|(\([^\s()<>]+\)))\))+)(?=\s|$)/gi;
+});
+
 (function(){
   //make it so I take text and mentions rather than the modelapp.helpers.textFormatter(
   var textFormatter = function textFormatter(text, model) {
@@ -17,20 +23,29 @@
     converter.hooks.chain("preConversion", function(text) {
 
       // add < > around plain urls, effectively making them "autolinks"
-      // regex copied from: http://daringfireball.net/2010/07/improved_regex_for_matching_urls (slightly modified)
-      var urlRegex = /(^|\s)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/gi;
-      text = text.replace(urlRegex, function(wholematch, space, url) {
-        if( url.match(/^[^\w]/) ) return wholematch; // evil witchcraft, noop
-        return space+"<"+url+">";
+      text = text.replace(Diaspora.url_regex, function() {
+        var url = arguments[2];
+        if( url.match(/^[^\w]/) ) return url; // evil witchcraft, noop
+        return arguments[1]+"<"+url+">";
       });
 
       // process links
       // regex copied from: https://code.google.com/p/pagedown/source/browse/Markdown.Converter.js#1198 (and slightly expanded)
-      var linkRegex = /(\[.*\]:\s)?(<|\()((?:(https?|ftp):\/\/[^\/'">\s]|www)[^'">\s]+?)(>|\))/gi;
+      var linkRegex = /(\[.*\]:\s)?(<|\()((?:(https?|ftp):\/\/[^\/'">\s]|www)[^'">\s]+?)([>\)]{1,2})/gi;
       text = text.replace(linkRegex, function() {
         var unicodeUrl = arguments[3];
+        var urlSuffix = arguments[5];
+
         unicodeUrl = ( unicodeUrl.match(/^www/) ) ? ('http://' + unicodeUrl) : unicodeUrl;
 
+        // handle parentheses, especially in case the link ends with ')'
+        if( urlSuffix.indexOf(')') != -1 && urlSuffix.indexOf('>') != -1 ) {
+          unicodeUrl += ')';
+          urlSuffix = '>';
+        }
+        // markdown doesn't like '(' or ')' anywhere, except where it wants
+        var workingUrl = unicodeUrl.replace(/\(/, "%28").replace(/\)/, "%29");
+
         var addr = parse_url(unicodeUrl);
         if( !addr.host ) addr.host = ""; // must not be 'undefined'
 
@@ -45,8 +60,8 @@
           (!addr.query ? '' : '?' + encodeURI(addr.query) ) +
           (!addr.fragment ? '' : '#' + encodeURI(addr.fragment) );
         if( !arguments[1] || arguments[1] == "") { // inline link
-          if(arguments[2] == "<") return "["+unicodeUrl+"]("+asciiUrl+")"; // without link text
-          else return arguments[2]+asciiUrl+arguments[5]; // with link text
+          if(arguments[2] == "<") return "["+workingUrl+"]("+asciiUrl+")"; // without link text
+          else return arguments[2]+asciiUrl+urlSuffix; // with link text
         } else { // reference style link
           return arguments[1]+asciiUrl;
         }
diff --git a/spec/fixtures/bad_urls.txt b/spec/fixtures/bad_urls.txt
new file mode 100644
index 000000000..61ccd7b96
--- /dev/null
+++ b/spec/fixtures/bad_urls.txt
@@ -0,0 +1,43 @@
+#
+# http://mathiasbynens.be/demo/url-regex
+# lines starting with '#' are ignored
+#
+http://
+http://.
+http://..
+http://../
+http://?
+http://??
+http://??/
+http://#
+http://##
+http://##/
+http://foo.bar?q=Spaces should be encoded
+//
+//a
+///a
+///
+http:///a
+foo.com
+rdar://1234
+h://test
+http:// shouldfail.com
+:// should fail
+http://foo.bar/foo(bar)baz quux
+ftps://foo.bar/
+http://-error-.invalid/
+http://a.b--c.de/
+http://-a.b.co
+http://a.b-.co
+http://0.0.0.0
+http://10.1.1.0
+http://10.1.1.255
+http://224.1.1.1
+http://1.1.1.1.1
+http://123.123.123
+http://3628126748
+http://.www.foo.bar/
+http://www.foo.bar./
+http://.www.foo.bar./
+http://10.1.1.1
+http://10.1.1.254
\ No newline at end of file
diff --git a/spec/fixtures/good_urls.txt b/spec/fixtures/good_urls.txt
new file mode 100644
index 000000000..2ef9ff98b
--- /dev/null
+++ b/spec/fixtures/good_urls.txt
@@ -0,0 +1,40 @@
+#
+# http://mathiasbynens.be/demo/url-regex
+# lines starting with '#' are ignored
+#
+http://foo.com/blah_blah
+http://foo.com/blah_blah/
+http://foo.com/blah_blah_(wikipedia)
+http://foo.com/blah_blah_(wikipedia)_(again)
+http://www.example.com/wpstyle/?p=364
+https://www.example.com/foo/?bar=baz&inga=42&quux
+http://✪df.ws/123
+http://userid:password@example.com:8080
+http://userid:password@example.com:8080/
+http://userid@example.com
+http://userid@example.com/
+http://userid@example.com:8080
+http://userid@example.com:8080/
+http://userid:password@example.com
+http://userid:password@example.com/
+http://142.42.1.1/
+http://142.42.1.1:8080/
+http://➡.ws/䨹
+http://⌘.ws
+http://⌘.ws/
+http://foo.com/blah_(wikipedia)#cite-1
+http://foo.com/blah_(wikipedia)_blah#cite-1
+http://foo.com/unicode_(✪)_in_parens
+http://foo.com/(something)?after=parens
+http://☺.damowmow.com/
+http://code.google.com/events/#&product=browser
+http://j.mp
+ftp://foo.bar/baz
+http://foo.bar/?q=Test%20URL-encoded%20stuff
+http://مثال.إختبار
+http://例子.测试
+http://उदाहरण.परीक्षा
+# http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com
+http://1337.net
+http://a.b-c.de
+http://223.255.255.254
\ No newline at end of file
diff --git a/spec/javascripts/app/helpers/text_formatter_spec.js b/spec/javascripts/app/helpers/text_formatter_spec.js
index d175e833e..51acb2816 100644
--- a/spec/javascripts/app/helpers/text_formatter_spec.js
+++ b/spec/javascripts/app/helpers/text_formatter_spec.js
@@ -41,7 +41,7 @@ describe("app.helpers.textFormatter", function(){
       //
       // var formattedText = this.formatter.markdownify(links.join(" "))
 
-      var formattedText = this.formatter.markdownify(links.join(" and "))
+      var formattedText = this.formatter.markdownify(links.join(" and "));
       var wrapper = $("<div>").html(formattedText);
 
       _.each(links, function(link) {
@@ -87,14 +87,16 @@ describe("app.helpers.textFormatter", function(){
           "http://bündnis-für-krankenhäuser.de/wp-content/uploads/2011/11/cropped-logohp.jpg",
           "http://موقع.وزارة-الاتصالات.مصر/", // example from #3082
           "http:///scholar.google.com/citations?view_op=top_venues",
-          "http://lyricstranslate.com/en/someone-you-നിന്നെ-പോലൊരാള്‍.html" // example from #3063
+          "http://lyricstranslate.com/en/someone-you-നിന്നെ-പോലൊരാള്‍.html", // example from #3063,
+          "http://de.wikipedia.org/wiki/Liste_der_Abkürzungen_(Netzjargon)" // #3645
         ];
         this.asciiUrls = [
           "http://www.xn--brgerentscheid-krankenhuser-xkc78d.de",
           "http://xn--bndnis-fr-krankenhuser-i5b27cha.de/wp-content/uploads/2011/11/cropped-logohp.jpg",
           "http://xn--4gbrim.xn----ymcbaaajlc6dj7bxne2c.xn--wgbh1c/",
           "http:///scholar.google.com/citations?view_op=top_venues",
-          "http://lyricstranslate.com/en/someone-you-%E0%B4%A8%E0%B4%BF%E0%B4%A8%E0%B5%8D%E0%B4%A8%E0%B5%86-%E0%B4%AA%E0%B5%8B%E0%B4%B2%E0%B5%8A%E0%B4%B0%E0%B4%BE%E0%B4%B3%E0%B5%8D%E2%80%8D.html"
+          "http://lyricstranslate.com/en/someone-you-%E0%B4%A8%E0%B4%BF%E0%B4%A8%E0%B5%8D%E0%B4%A8%E0%B5%86-%E0%B4%AA%E0%B5%8B%E0%B4%B2%E0%B5%8A%E0%B4%B0%E0%B4%BE%E0%B4%B3%E0%B5%8D%E2%80%8D.html",
+          "http://de.wikipedia.org/wiki/Liste_der_Abk%C3%BCrzungen_%28Netzjargon%29"
         ];
       });
 
@@ -141,6 +143,46 @@ describe("app.helpers.textFormatter", function(){
 
     });
 
+    context("misc breakage and/or other issues with weird urls", function(){
+      it("doesn't crash Chromium - RUN ME WITH CHROMIUM! (issue #3553)", function() {
+
+        var text_part = 'Revert "rails admin is conflicting with client side validations: see https://github.com/sferik/rails_admin/issues/985"';
+        var link_part = 'https://github.com/diaspora/diaspora/commit/61f40fc6bfe6bb859c995023b5a17d22c9b5e6e5';
+        var content = '['+text_part+']('+link_part+')';
+        var parsed = this.formatter.markdownify(content);
+
+        var link = 'href="' + link_part + '"';
+        var text = '>'+ text_part +'<';
+
+        expect(parsed).toContain(link);
+        expect(parsed).toContain(text);
+      });
+
+      it("tests a bunch of benchmark urls", function(){
+        var self = this;
+        $.ajax({
+          async: false,
+          cache: false,
+          url: '/spec/fixtures/good_urls.txt',
+          success: function(data) { self.url_list = data.split("\n"); }
+        });
+
+        _.each(this.url_list, function(url) {
+          // 'comments'
+          if( url.match(/^#/) ) return;
+
+          // regex.test is stupid, use match and boolean-ify it
+          var result = !!url.match(Diaspora.url_regex);
+          expect(result).toBeTruthy();
+          if( !result && console && console.log ) {
+            console.log(url);
+          }
+        });
+      });
+
+      // TODO: try to match the 'bad_urls.txt' and have as few matches as possible
+    });
+
   })
 
   describe(".hashtagify", function(){