From: dongsheng Date: Wed, 1 Jul 2009 03:46:20 +0000 (+0000) Subject: "MDL-19180, url plugin, grab image resources only" X-Git-Url: http://git.mjollnir.org/gw?a=commitdiff_plain;h=47693061362b2c11428a76228a95781e71ea0b88;p=moodle.git "MDL-19180, url plugin, grab image resources only" --- diff --git a/repository/url/lib.php b/repository/url/lib.php index e96a06809e..9d41127ab5 100644 --- a/repository/url/lib.php +++ b/repository/url/lib.php @@ -431,4 +431,271 @@ function join_url( $parts, $encode=TRUE ) $url .= '#' . $parts['fragment']; return $url; } -?> +/** + * Extract URLs from a web page. + * + * URLs are extracted from a long list of tags and attributes as defined + * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications. + * URLs are also extracted from tags and attributes that are common + * extensions of HTML, from the draft Forms 2.0 specification, from XHTML, + * and from WML 1.3 and 2.0. + * + * The function returns an associative array of associative arrays of + * arrays of URLs. The outermost array's keys are the tag (element) name, + * such as "a" for or "img" for . The values for these entries + * are associative arrays where the keys are attribute names for those + * tags, such as "href" for . Finally, the values for + * those arrays are URLs found in those tags and attributes throughout + * the text. + * + * Parameters: + * text the UTF-8 text to scan + * + * Return values: + * an associative array where keys are tags and values are an + * associative array where keys are attributes and values are + * an array of URLs. + * + * See: + * http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page + */ +function extract_html_urls( $text ) +{ + $match_elements = array( + // HTML + array('element'=>'a', 'attribute'=>'href'), // 2.0 + array('element'=>'a', 'attribute'=>'urn'), // 2.0 + array('element'=>'base', 'attribute'=>'href'), // 2.0 + array('element'=>'form', 'attribute'=>'action'), // 2.0 + array('element'=>'img', 'attribute'=>'src'), // 2.0 + array('element'=>'link', 'attribute'=>'href'), // 2.0 + + array('element'=>'applet', 'attribute'=>'code'), // 3.2 + array('element'=>'applet', 'attribute'=>'codebase'), // 3.2 + array('element'=>'area', 'attribute'=>'href'), // 3.2 + array('element'=>'body', 'attribute'=>'background'), // 3.2 + array('element'=>'img', 'attribute'=>'usemap'), // 3.2 + array('element'=>'input', 'attribute'=>'src'), // 3.2 + + array('element'=>'applet', 'attribute'=>'archive'), // 4.01 + array('element'=>'applet', 'attribute'=>'object'), // 4.01 + array('element'=>'blockquote', 'attribute'=>'cite'), // 4.01 + array('element'=>'del', 'attribute'=>'cite'), // 4.01 + array('element'=>'frame', 'attribute'=>'longdesc'), // 4.01 + array('element'=>'frame', 'attribute'=>'src'), // 4.01 + array('element'=>'head', 'attribute'=>'profile'), // 4.01 + array('element'=>'iframe', 'attribute'=>'longdesc'), // 4.01 + array('element'=>'iframe', 'attribute'=>'src'), // 4.01 + array('element'=>'img', 'attribute'=>'longdesc'), // 4.01 + array('element'=>'input', 'attribute'=>'usemap'), // 4.01 + array('element'=>'ins', 'attribute'=>'cite'), // 4.01 + array('element'=>'object', 'attribute'=>'archive'), // 4.01 + array('element'=>'object', 'attribute'=>'classid'), // 4.01 + array('element'=>'object', 'attribute'=>'codebase'), // 4.01 + array('element'=>'object', 'attribute'=>'data'), // 4.01 + array('element'=>'object', 'attribute'=>'usemap'), // 4.01 + array('element'=>'q', 'attribute'=>'cite'), // 4.01 + array('element'=>'script', 'attribute'=>'src'), // 4.01 + + array('element'=>'audio', 'attribute'=>'src'), // 5.0 + array('element'=>'command', 'attribute'=>'icon'), // 5.0 + array('element'=>'embed', 'attribute'=>'src'), // 5.0 + array('element'=>'event-source','attribute'=>'src'), // 5.0 + array('element'=>'html', 'attribute'=>'manifest'), // 5.0 + array('element'=>'source', 'attribute'=>'src'), // 5.0 + array('element'=>'video', 'attribute'=>'src'), // 5.0 + array('element'=>'video', 'attribute'=>'poster'), // 5.0 + + array('element'=>'bgsound', 'attribute'=>'src'), // Extension + array('element'=>'body', 'attribute'=>'credits'), // Extension + array('element'=>'body', 'attribute'=>'instructions'), // Extension + array('element'=>'body', 'attribute'=>'logo'), // Extension + array('element'=>'div', 'attribute'=>'href'), // Extension + array('element'=>'div', 'attribute'=>'src'), // Extension + array('element'=>'embed', 'attribute'=>'code'), // Extension + array('element'=>'embed', 'attribute'=>'pluginspage'), // Extension + array('element'=>'html', 'attribute'=>'background'), // Extension + array('element'=>'ilayer', 'attribute'=>'src'), // Extension + array('element'=>'img', 'attribute'=>'dynsrc'), // Extension + array('element'=>'img', 'attribute'=>'lowsrc'), // Extension + array('element'=>'input', 'attribute'=>'dynsrc'), // Extension + array('element'=>'input', 'attribute'=>'lowsrc'), // Extension + array('element'=>'table', 'attribute'=>'background'), // Extension + array('element'=>'td', 'attribute'=>'background'), // Extension + array('element'=>'th', 'attribute'=>'background'), // Extension + array('element'=>'layer', 'attribute'=>'src'), // Extension + array('element'=>'xml', 'attribute'=>'src'), // Extension + + array('element'=>'button', 'attribute'=>'action'), // Forms 2.0 + array('element'=>'datalist', 'attribute'=>'data'), // Forms 2.0 + array('element'=>'form', 'attribute'=>'data'), // Forms 2.0 + array('element'=>'input', 'attribute'=>'action'), // Forms 2.0 + array('element'=>'select', 'attribute'=>'data'), // Forms 2.0 + + // XHTML + array('element'=>'html', 'attribute'=>'xmlns'), + + // WML + array('element'=>'access', 'attribute'=>'path'), // 1.3 + array('element'=>'card', 'attribute'=>'onenterforward'), // 1.3 + array('element'=>'card', 'attribute'=>'onenterbackward'),// 1.3 + array('element'=>'card', 'attribute'=>'ontimer'), // 1.3 + array('element'=>'go', 'attribute'=>'href'), // 1.3 + array('element'=>'option', 'attribute'=>'onpick'), // 1.3 + array('element'=>'template', 'attribute'=>'onenterforward'), // 1.3 + array('element'=>'template', 'attribute'=>'onenterbackward'),// 1.3 + array('element'=>'template', 'attribute'=>'ontimer'), // 1.3 + array('element'=>'wml', 'attribute'=>'xmlns'), // 2.0 + ); + + $match_metas = array( + 'content-base', + 'content-location', + 'referer', + 'location', + 'refresh', + ); + + // Extract all elements + if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) ) + return array( ); + $elements = $matches[1]; + $value_pattern = '=(("([^"]*)")|([^\s]*))'; + + // Match elements and attributes + foreach ( $match_elements as $match_element ) + { + $name = $match_element['element']; + $attr = $match_element['attribute']; + $pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu'; + if ( $name == 'object' ) + $split_pattern = '/\s*/u'; // Space-separated URL list + else if ( $name == 'archive' ) + $split_pattern = '/,\s*/u'; // Comma-separated URL list + else + unset( $split_pattern ); // Single URL + foreach ( $elements as $element ) + { + if ( !preg_match( $pattern, $element, $match ) ) + continue; + $m = empty($match[3]) ? (!empty($match[4])?$match[4]:'') : $match[3]; + if ( !isset( $split_pattern ) ) + $urls[$name][$attr][] = $m; + else + { + $msplit = preg_split( $split_pattern, $m ); + foreach ( $msplit as $ms ) + $urls[$name][$attr][] = $ms; + } + } + } + + // Match meta http-equiv elements + foreach ( $match_metas as $match_meta ) + { + $attr_pattern = '/http-equiv="?' . $match_meta . '"?/iu'; + $content_pattern = '/content' . $value_pattern . '/iu'; + $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu'; + foreach ( $elements as $element ) + { + if ( !preg_match( '/^meta/iu', $element ) || + !preg_match( $attr_pattern, $element ) || + !preg_match( $content_pattern, $element, $match ) ) + continue; + $m = empty($match[3]) ? $match[4] : $match[3]; + if ( $match_meta != 'refresh' ) + $urls['meta']['http-equiv'][] = $m; + else if ( preg_match( $refresh_pattern, $m, $match ) ) + $urls['meta']['http-equiv'][] = $match[2]; + } + } + + // Match style attributes + $urls['style'] = array( ); + $style_pattern = '/style' . $value_pattern . '/iu'; + foreach ( $elements as $element ) + { + if ( !preg_match( $style_pattern, $element, $match ) ) + continue; + $m = empty($match[3]) ? $match[4] : $match[3]; + $style_urls = extract_css_urls( $m ); + if ( !empty( $style_urls ) ) + $urls['style'] = array_merge_recursive( + $urls['style'], $style_urls ); + } + + // Match style bodies + if ( preg_match_all( '/]*>(.*?)<\/style>/siu', $text, $style_bodies ) ) + { + foreach ( $style_bodies[1] as $style_body ) + { + $style_urls = extract_css_urls( $style_body ); + if ( !empty( $style_urls ) ) + $urls['style'] = array_merge_recursive( + $urls['style'], $style_urls ); + } + } + if ( empty($urls['style']) ) + unset( $urls['style'] ); + + return $urls; +} +/** + * Extract URLs from UTF-8 CSS text. + * + * URLs within @import statements and url() property functions are extracted + * and returned in an associative array of arrays. Array keys indicate + * the use context for the URL, including: + * + * "import" + * "property" + * + * Each value in the associative array is an array of URLs. + * + * Parameters: + * text the UTF-8 text to scan + * + * Return values: + * an associative array of arrays of URLs. + * + * See: + * http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file + */ +function extract_css_urls( $text ) +{ + $urls = array( ); + + $url_pattern = '(([^\\\\\'", \(\)]*(\\\\.)?)+)'; + $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)'; + $pattern = '/(' . + '(@import\s*[\'"]' . $url_pattern . '[\'"])' . + '|(@import\s*' . $urlfunc_pattern . ')' . + '|(' . $urlfunc_pattern . ')' . ')/iu'; + if ( !preg_match_all( $pattern, $text, $matches ) ) + return $urls; + + // @import '...' + // @import "..." + foreach ( $matches[3] as $match ) + if ( !empty($match) ) + $urls['import'][] = + preg_replace( '/\\\\(.)/u', '\\1', $match ); + + // @import url(...) + // @import url('...') + // @import url("...") + foreach ( $matches[7] as $match ) + if ( !empty($match) ) + $urls['import'][] = + preg_replace( '/\\\\(.)/u', '\\1', $match ); + + // url(...) + // url('...') + // url("...") + foreach ( $matches[11] as $match ) + if ( !empty($match) ) + $urls['property'][] = + preg_replace( '/\\\\(.)/u', '\\1', $match ); + + return $urls; +} diff --git a/repository/url/repository.class.php b/repository/url/repository.class.php index 58f9183a95..01d89a2e6c 100755 --- a/repository/url/repository.class.php +++ b/repository/url/repository.class.php @@ -111,16 +111,18 @@ EOD; public function analyse_page($baseurl, $content, &$list) { global $CFG, $OUTPUT; $OUTPUT->initialise_deprecated_cfg_pixpath(); - $pattern = '#src="?\'?([[:alnum:]:?=&@/._+-]+)"?\'?#i'; - $matches = null; - preg_match_all($pattern, $content, $matches); - $matches = array_unique($matches[1]); - if (!empty($matches)) { - foreach($matches as $url) { + $urls = extract_html_urls($content); + $images = $urls['img']['src']; + $pattern = '#img(.+)src="?\'?([[:alnum:]:?=&@/._+-]+)"?\'?#i'; + if (!empty($images)) { + foreach($images as $url) { $list['list'][] = array( 'title'=>$this->guess_filename($url, ''), 'source'=>url_to_absolute($baseurl, $url), - 'thumbnail' => $CFG->pixpath .'/f/'. mimeinfo('icon32', $url) + 'thumbnail'=>url_to_absolute($baseurl, $url), + 'thumbnail_height'=>84, + 'thumbnail_width'=>84 + //'thumbnail' => $CFG->pixpath .'/f/'. mimeinfo('icon32', $url) ); } }