From: Andrew Davis Date: Thu, 24 Dec 2009 06:44:51 +0000 (+0000) Subject: weblib MDL-21168 improved convert_urls_into_links X-Git-Url: http://git.mjollnir.org/gw?a=commitdiff_plain;h=c013b31695eb383172bff19d8db167c35dceeebe;p=moodle.git weblib MDL-21168 improved convert_urls_into_links --- diff --git a/lib/simpletest/testweblib.php b/lib/simpletest/testweblib.php index 64dbd435e8..1f26c1e655 100644 --- a/lib/simpletest/testweblib.php +++ b/lib/simpletest/testweblib.php @@ -166,49 +166,88 @@ END; } function test_convert_urls_into_links() { - $texts = array ( +$texts = array ( + //just a url + 'http://moodle.org - URL' => 'http://moodle.org - URL', + 'www.moodle.org - URL' => 'www.moodle.org - URL', + //url with params 'URL: http://moodle.org/s/i=1&j=2' => 'URL: http://moodle.org/s/i=1&j=2', + //url with escaped params 'URL: www.moodle.org/s/i=1&j=2' => 'URL: www.moodle.org/s/i=1&j=2', + //https url with params 'URL: https://moodle.org/s/i=1&j=2' => 'URL: https://moodle.org/s/i=1&j=2', + //url with port and params 'URL: http://moodle.org:8080/s/i=1' => 'URL: http://moodle.org:8080/s/i=1', - 'http://moodle.org - URL' => 'http://moodle.org - URL', - 'www.moodle.org - URL' => 'www.moodle.org - URL', + //url in brackets '(http://moodle.org) - URL' => '(http://moodle.org) - URL', '(www.moodle.org) - URL' => '(www.moodle.org) - URL', + //url in square brackets '[http://moodle.org] - URL' => '[http://moodle.org] - URL', '[www.moodle.org] - URL' => '[www.moodle.org] - URL', + //url in brackets with anchor '[http://moodle.org/main#anchor] - URL' => '[http://moodle.org/main#anchor] - URL', '[www.moodle.org/main#anchor] - URL' => '[www.moodle.org/main#anchor] - URL', + //brackets within the url 'URL: http://cc.org/url_(withpar)_go/?i=2' => 'URL: http://cc.org/url_(withpar)_go/?i=2', 'URL: www.cc.org/url_(withpar)_go/?i=2' => 'URL: www.cc.org/url_(withpar)_go/?i=2', 'URL: http://cc.org/url_(with)_(par)_go/?i=2' => 'URL: http://cc.org/url_(with)_(par)_go/?i=2', 'URL: www.cc.org/url_(with)_(par)_go/?i=2' => 'URL: www.cc.org/url_(with)_(par)_go/?i=2', + 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'http://en.wikipedia.org/wiki/Slash_(punctuation)', + 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL', + 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL', + //escaped brackets in url + 'http://en.wikipedia.org/wiki/Slash_%28punctuation%29'=>'http://en.wikipedia.org/wiki/Slash_%28punctuation%29', + //anchor tag 'URL: http://moodle.org' => 'URL: http://moodle.org', 'URL: www.moodle.org' => 'URL: www.moodle.org', 'URL: http://moodle.org' => 'URL: http://moodle.org', 'URL: www.moodle.org' => 'URL: www.moodle.org', + //escaped anchor tag + htmlspecialchars('escaped anchor tag www.moodle.org') => 'escaped anchor tag <a href="http://moodle.org"> www.moodle.org</a>', + //trailing fullstop 'URL: http://moodle.org/s/i=1&j=2.' => 'URL: http://moodle.org/s/i=1&j=2.', 'URL: www.moodle.org/s/i=1&j=2.' => 'URL: www.moodle.org/s/i=1&j=2.', + //trailing unmatched bracket 'URL: http://moodle.org)
' => 'URL: http://moodle.org)
', + //partially escaped html 'URL:

text www.moodle.org</p> text' => 'URL:

text www.moodle.org</p> text', + //decimal url parameter 'URL: www.moodle.org?u=1.23' => 'URL: www.moodle.org?u=1.23', + //escaped space in url 'URL: www.moodle.org?u=test+param&' => 'URL: www.moodle.org?u=test+param&', + //odd characters in url param 'URL: www.moodle.org?param=:)' => 'URL: www.moodle.org?param=:)', + //multiple urls 'URL: http://moodle.org www.moodle.org' => 'URL: http://moodle.org www.moodle.org', + //containing anchor tags including a class parameter and a url to convert 'URL: http://moodle.org www.moodle.org http://moodle.org' => 'URL: http://moodle.org www.moodle.org http://moodle.org', + //subdomain 'http://subdomain.moodle.org - URL' => 'http://subdomain.moodle.org - URL', + //multiple subdomains 'http://subdomain.subdomain.moodle.org - URL' => 'http://subdomain.subdomain.moodle.org - URL', + //looks almost like a link but isnt 'This contains http, http:// and www but no actual links.'=>'This contains http, http:// and www but no actual links.', + //no link at all 'This is a story about moodle.coming to a cinema near you.'=>'This is a story about moodle.coming to a cinema near you.', - 'http://en.wikipedia.org/wiki/Slash_%28punctuation%29'=>'http://en.wikipedia.org/wiki/Slash_%28punctuation%29', - 'http://en.wikipedia.org/wiki/Slash_(punctuation)'=>'http://en.wikipedia.org/wiki/Slash_(punctuation)', - 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/%28#Parentheses_.28_.29 - URL', - 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL' => 'http://en.wikipedia.org/wiki/(#Parentheses_.28_.29 - URL', + //utf 8 characters 'http://Iñtërnâtiônàlizætiøn.com?ô=nëø'=>'http://Iñtërnâtiônàlizætiøn.com?ô=nëø', 'www.Iñtërnâtiônàlizætiøn.com?ô=nëø'=>'www.Iñtërnâtiônàlizætiøn.com?ô=nëø', - 'moodle.org' => 'moodle.org',//too hard to identify without additional regexs + //too hard to identify without additional regexs + 'moodle.org' => 'moodle.org', + //some text with no link between related html tags + 'no link here' => 'no link here', + //some text with a link between related html tags + 'a link here www.moodle.org' => 'a link here www.moodle.org', + //some text containing a link within unrelated tags + '
This is some text. www.moodle.com then some more text
' => '
This is some text. www.moodle.com then some more text
', + //check we aren't modifying img tags + 'image' => 'image', + //partially escaped img tag + 'partially escaped img tag <img src="http://moodle.org/logo/logo-240x60.gif" />' => 'partially escaped img tag <img src="http://moodle.org/logo/logo-240x60.gif" />', + //fully escaped img tag + htmlspecialchars('fully escaped img tag ') => 'fully escaped img tag <img src="http://moodle.org/logo/logo-240x60.gif" />', ); foreach ($texts as $text => $correctresult) { if(mb_detect_encoding($text)=='UTF-8') { @@ -232,6 +271,7 @@ END; $this->assertEqual($text, $correctresult, $msg); } + //performance testing $reps = 1000; $time_start = microtime(true); diff --git a/lib/weblib.php b/lib/weblib.php index 3df5258c44..b99b5e1413 100644 --- a/lib/weblib.php +++ b/lib/weblib.php @@ -1720,10 +1720,16 @@ function html_to_text($html) { * @param string $text Passed in by reference. The string to be searched for urls. */ function convert_urls_into_links(&$text) { - $filterignoretagsopen = array(']+?>'); - $filterignoretagsclose = array(''); + //I've added img tags to this list of tags to ignore. + //See MDL-21168 for more info. A better way to ignore tags whether or not + //they are escaped partially or completely would be desirable. For example: + // + //<a href="blah"> + //<a href="blah"> + $filterignoretagsopen = array(']+?>', ']+?>'); + $filterignoretagsclose = array('',''); filter_save_ignore_tags($text,$filterignoretagsopen,$filterignoretagsclose,$ignoretags); - + // Check if we support unicode modifiers in regular expressions. Cache it. // TODO: this check should be a environment requirement in Moodle 2.0, as far as unicode // chars are going to arrive to URLs officially really soon (2010?)