git.mjollnir.org Git - moodle.git/commitdiff
"MDL-19180, url plugin, grab image resources only"
author: dongsheng <dongsheng>
Wed, 1 Jul 2009 03:46:20 +0000 (03:46 +0000)
committer: dongsheng <dongsheng>
Wed, 1 Jul 2009 03:46:20 +0000 (03:46 +0000)
repository/url/lib.php
repository/url/repository.class.php

index e96a06809ee44ccd68be61b56d7a5a7f9f0abb93..9d41127ab59f374052a1bf2c7b7370c83024b8cf 100644 (file)
@@ -431,4 +431,271 @@ function join_url( $parts, $encode=TRUE )
                $url .= '#' . $parts['fragment'];
        return $url;
 }
-?>
+/**
+ * Extract URLs from a web page.
+ *
+ * URLs are extracted from a long list of tags and attributes as defined
+ * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
+ * URLs are also extracted from tags and attributes that are common
+ * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
+ * and from WML 1.3 and 2.0.
+ *
+ * The function returns an associative array of associative arrays of
+ * arrays of URLs.  The outermost array's keys are the tag (element) name,
+ * such as "a" for <a> or "img" for <img>.  The values for these entries
+ * are associative arrays where the keys are attribute names for those
+ * tags, such as "href" for <a href="...">.  Finally, the values for
+ * those arrays are URLs found in those tags and attributes throughout
+ * the text.
+ *
+ * Three special cases:
+ *     - URLs found in <meta http-equiv="..." content="..."> elements are
+ *       returned under ['meta']['http-equiv'].
+ *     - URLs found in style="..." attributes and <style> bodies are
+ *       returned under ['style'], using the keys produced by
+ *       extract_css_urls().
+ *     - The "archive" attribute holds a URL list: space-separated on
+ *       <object>, comma-separated on <applet>.  Each list entry is
+ *       returned as a separate URL.
+ *
+ * Attribute values may be double-quoted, single-quoted or unquoted, and
+ * whitespace around "=" is accepted.  Surrounding whitespace is trimmed
+ * and empty values are skipped.
+ *
+ * Parameters:
+ *     text            the UTF-8 text to scan
+ *
+ * Return values:
+ *     an associative array where keys are tags and values are an
+ *     associative array where keys are attributes and values are
+ *     an array of URLs.  An empty array is returned when nothing
+ *     is found.
+ *
+ * See:
+ *     http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
+ */
+function extract_html_urls( $text )
+{
+       $match_elements = array(
+               // HTML
+               array('element'=>'a',           'attribute'=>'href'),           // 2.0
+               array('element'=>'a',           'attribute'=>'urn'),            // 2.0
+               array('element'=>'base',        'attribute'=>'href'),           // 2.0
+               array('element'=>'form',        'attribute'=>'action'),         // 2.0
+               array('element'=>'img',         'attribute'=>'src'),            // 2.0
+               array('element'=>'link',        'attribute'=>'href'),           // 2.0
+
+               array('element'=>'applet',      'attribute'=>'code'),           // 3.2
+               array('element'=>'applet',      'attribute'=>'codebase'),       // 3.2
+               array('element'=>'area',        'attribute'=>'href'),           // 3.2
+               array('element'=>'body',        'attribute'=>'background'),     // 3.2
+               array('element'=>'img',         'attribute'=>'usemap'),         // 3.2
+               array('element'=>'input',       'attribute'=>'src'),            // 3.2
+
+               array('element'=>'applet',      'attribute'=>'archive'),        // 4.01
+               array('element'=>'applet',      'attribute'=>'object'),         // 4.01
+               array('element'=>'blockquote',  'attribute'=>'cite'),           // 4.01
+               array('element'=>'del',         'attribute'=>'cite'),           // 4.01
+               array('element'=>'frame',       'attribute'=>'longdesc'),       // 4.01
+               array('element'=>'frame',       'attribute'=>'src'),            // 4.01
+               array('element'=>'head',        'attribute'=>'profile'),        // 4.01
+               array('element'=>'iframe',      'attribute'=>'longdesc'),       // 4.01
+               array('element'=>'iframe',      'attribute'=>'src'),            // 4.01
+               array('element'=>'img',         'attribute'=>'longdesc'),       // 4.01
+               array('element'=>'input',       'attribute'=>'usemap'),         // 4.01
+               array('element'=>'ins',         'attribute'=>'cite'),           // 4.01
+               array('element'=>'object',      'attribute'=>'archive'),        // 4.01
+               array('element'=>'object',      'attribute'=>'classid'),        // 4.01
+               array('element'=>'object',      'attribute'=>'codebase'),       // 4.01
+               array('element'=>'object',      'attribute'=>'data'),           // 4.01
+               array('element'=>'object',      'attribute'=>'usemap'),         // 4.01
+               array('element'=>'q',           'attribute'=>'cite'),           // 4.01
+               array('element'=>'script',      'attribute'=>'src'),            // 4.01
+
+               array('element'=>'audio',       'attribute'=>'src'),            // 5.0
+               array('element'=>'command',     'attribute'=>'icon'),           // 5.0
+               array('element'=>'embed',       'attribute'=>'src'),            // 5.0
+               array('element'=>'event-source','attribute'=>'src'),            // 5.0
+               array('element'=>'html',        'attribute'=>'manifest'),       // 5.0
+               array('element'=>'source',      'attribute'=>'src'),            // 5.0
+               array('element'=>'video',       'attribute'=>'src'),            // 5.0
+               array('element'=>'video',       'attribute'=>'poster'),         // 5.0
+
+               array('element'=>'bgsound',     'attribute'=>'src'),            // Extension
+               array('element'=>'body',        'attribute'=>'credits'),        // Extension
+               array('element'=>'body',        'attribute'=>'instructions'),   // Extension
+               array('element'=>'body',        'attribute'=>'logo'),           // Extension
+               array('element'=>'div',         'attribute'=>'href'),           // Extension
+               array('element'=>'div',         'attribute'=>'src'),            // Extension
+               array('element'=>'embed',       'attribute'=>'code'),           // Extension
+               array('element'=>'embed',       'attribute'=>'pluginspage'),    // Extension
+               array('element'=>'html',        'attribute'=>'background'),     // Extension
+               array('element'=>'ilayer',      'attribute'=>'src'),            // Extension
+               array('element'=>'img',         'attribute'=>'dynsrc'),         // Extension
+               array('element'=>'img',         'attribute'=>'lowsrc'),         // Extension
+               array('element'=>'input',       'attribute'=>'dynsrc'),         // Extension
+               array('element'=>'input',       'attribute'=>'lowsrc'),         // Extension
+               array('element'=>'table',       'attribute'=>'background'),     // Extension
+               array('element'=>'td',          'attribute'=>'background'),     // Extension
+               array('element'=>'th',          'attribute'=>'background'),     // Extension
+               array('element'=>'layer',       'attribute'=>'src'),            // Extension
+               array('element'=>'xml',         'attribute'=>'src'),            // Extension
+
+               array('element'=>'button',      'attribute'=>'action'),         // Forms 2.0
+               array('element'=>'datalist',    'attribute'=>'data'),           // Forms 2.0
+               array('element'=>'form',        'attribute'=>'data'),           // Forms 2.0
+               array('element'=>'input',       'attribute'=>'action'),         // Forms 2.0
+               array('element'=>'select',      'attribute'=>'data'),           // Forms 2.0
+
+               // XHTML
+               array('element'=>'html',        'attribute'=>'xmlns'),
+
+               // WML
+               array('element'=>'access',      'attribute'=>'path'),           // 1.3
+               array('element'=>'card',        'attribute'=>'onenterforward'), // 1.3
+               array('element'=>'card',        'attribute'=>'onenterbackward'),// 1.3
+               array('element'=>'card',        'attribute'=>'ontimer'),        // 1.3
+               array('element'=>'go',          'attribute'=>'href'),           // 1.3
+               array('element'=>'option',      'attribute'=>'onpick'),         // 1.3
+               array('element'=>'template',    'attribute'=>'onenterforward'), // 1.3
+               array('element'=>'template',    'attribute'=>'onenterbackward'),// 1.3
+               array('element'=>'template',    'attribute'=>'ontimer'),        // 1.3
+               array('element'=>'wml',         'attribute'=>'xmlns'),          // 2.0
+       );
+
+       $match_metas = array(
+               'content-base',
+               'content-location',
+               'referer',
+               'location',
+               'refresh',
+       );
+
+       $urls = array( );
+
+       // Extract all elements: the text between "<" and ">" of each opening
+       // tag (closing tags start with "/" and are therefore not captured).
+       if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
+               return $urls;
+       $elements = $matches[1];
+
+       // An attribute value: "double-quoted", 'single-quoted' or unquoted.
+       // At most one of the three capture groups is non-empty on a match,
+       // so concatenating all capture groups yields the value without its
+       // quotes.  Patterns built from this fragment must not add capture
+       // groups of their own.
+       $value_pattern = '\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'][^\s]*))';
+
+       // Match elements and attributes
+       foreach ( $match_elements as $match_element )
+       {
+               $name = $match_element['element'];
+               $attr = $match_element['attribute'];
+               // The attribute name must be preceded by whitespace so that "src"
+               // does not match inside "dynsrc" or "data-src".  The "s" modifier
+               // lets "." cross newlines in tags that span several lines.
+               $pattern = '/^' . preg_quote( $name, '/' ) . '\s(?:.*\s)?' .
+                       preg_quote( $attr, '/' ) . $value_pattern . '/isu';
+               if ( $attr != 'archive' )
+                       $split_pattern = null;          // Single URL
+               else if ( $name == 'object' )
+                       $split_pattern = '/\s+/u';      // Space-separated URL list
+               else
+                       $split_pattern = '/\s*,\s*/u';  // Comma-separated URL list
+               foreach ( $elements as $element )
+               {
+                       if ( !preg_match( $pattern, $element, $match ) )
+                               continue;
+                       $value = trim( implode( '', array_slice( $match, 1 ) ) );
+                       if ( $value === '' )
+                               continue;
+                       if ( $split_pattern === null )
+                               $urls[$name][$attr][] = $value;
+                       else
+                       {
+                               $parts = preg_split( $split_pattern, $value, -1,
+                                       PREG_SPLIT_NO_EMPTY );
+                               foreach ( $parts as $part )
+                                       $urls[$name][$attr][] = $part;
+                       }
+               }
+       }
+
+       // Match meta http-equiv elements
+       $content_pattern = '/\scontent' . $value_pattern . '/isu';
+       $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
+       foreach ( $match_metas as $match_meta )
+       {
+               $attr_pattern = '/\shttp-equiv\s*=\s*["\']?' .
+                       preg_quote( $match_meta, '/' ) . '["\']?/iu';
+               foreach ( $elements as $element )
+               {
+                       if ( !preg_match( '/^meta\s/iu', $element ) ||
+                               !preg_match( $attr_pattern, $element ) ||
+                               !preg_match( $content_pattern, $element, $match ) )
+                               continue;
+                       $value = trim( implode( '', array_slice( $match, 1 ) ) );
+                       if ( $match_meta == 'refresh' )
+                       {
+                               // content="<seconds>; url=<target>": keep only the target
+                               if ( !preg_match( $refresh_pattern, $value, $refresh ) )
+                                       continue;
+                               $value = trim( $refresh[2] );
+                       }
+                       if ( $value !== '' )
+                               $urls['meta']['http-equiv'][] = $value;
+               }
+       }
+
+       // Match style attributes
+       $style_urls = array( );
+       $style_pattern = '/\sstyle' . $value_pattern . '/isu';
+       foreach ( $elements as $element )
+       {
+               if ( !preg_match( $style_pattern, $element, $match ) )
+                       continue;
+               $found = extract_css_urls( implode( '', array_slice( $match, 1 ) ) );
+               if ( !empty( $found ) )
+                       $style_urls = array_merge_recursive( $style_urls, $found );
+       }
+
+       // Match style bodies
+       if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
+       {
+               foreach ( $style_bodies[1] as $style_body )
+               {
+                       $found = extract_css_urls( $style_body );
+                       if ( !empty( $found ) )
+                               $style_urls = array_merge_recursive( $style_urls, $found );
+               }
+       }
+
+       // Only expose the 'style' key when at least one CSS URL was found
+       if ( !empty( $style_urls ) )
+               $urls['style'] = $style_urls;
+
+       return $urls;
+}
+/**
+ * Extract URLs from UTF-8 CSS text.
+ *
+ * URLs within @import statements and url() property functions are extracted
+ * and returned in an associative array of arrays.  Array keys indicate
+ * the use context for the URL, including:
+ *
+ *     "import"        URLs from @import "...", @import '...' and
+ *                     @import url(...); quoted-form imports are listed
+ *                     before url()-form imports
+ *     "property"      URLs from url(...) used anywhere else
+ *
+ * Each value in the associative array is an array of URLs.  Backslash
+ * escapes inside a URL are removed (e.g. "\(" becomes "(").  A key is
+ * only present when at least one URL was found for it.
+ *
+ * Parameters:
+ *     text            the UTF-8 text to scan
+ *
+ * Return values:
+ *     an associative array of arrays of URLs; empty when nothing matches.
+ *
+ * See:
+ *     http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
+ */
+function extract_css_urls( $text )
+{
+       $urls = array( );
+
+       // A URL is a run of plain characters and backslash escapes.  The
+       // character run is possessive (*+): without it the nested quantifiers
+       // backtrack exponentially whenever the closing quote or parenthesis
+       // is not found right after a long run (for example a data: URI, whose
+       // "," ends the run), which can exhaust the PCRE backtrack limit and
+       // make preg_match_all() fail for the whole text.
+       $url_pattern     = '(([^\\\\\'", \(\)]*+(\\\\.)?)+)';
+       $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
+       $pattern         = '/(' .
+                '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
+               '|(@import\s*'      . $urlfunc_pattern . ')'      .
+               '|('                . $urlfunc_pattern . ')'      .  ')/iu';
+       if ( !preg_match_all( $pattern, $text, $matches ) )
+               return $urls;
+
+       // Capture group holding the URL for each syntax, and its context:
+       //    3: @import '...'  /  @import "..."
+       //    7: @import url(...)  /  url('...')  /  url("...")
+       //   11: url(...)  /  url('...')  /  url("...")
+       $contexts = array( 3 => 'import', 7 => 'import', 11 => 'property' );
+       foreach ( $contexts as $group => $context )
+       {
+               foreach ( $matches[$group] as $match )
+               {
+                       // Groups of the alternatives that did not match are ''
+                       if ( $match === '' )
+                               continue;
+                       $urls[$context][] =
+                               preg_replace( '/\\\\(.)/u', '\\1', $match );
+               }
+       }
+
+       return $urls;
+}
index 58f9183a9517f67a0b96bba5174af1b83a085a42..01d89a2e6cb20ea9e984c3ad422a93249880ef6f 100755 (executable)
@@ -111,16 +111,18 @@ EOD;
     public function analyse_page($baseurl, $content, &$list) {
         global $CFG, $OUTPUT;
         $OUTPUT->initialise_deprecated_cfg_pixpath();
-        $pattern = '#src="?\'?([[:alnum:]:?=&@/._+-]+)"?\'?#i';
-        $matches = null;
-        preg_match_all($pattern, $content, $matches);
-        $matches = array_unique($matches[1]);
-        if (!empty($matches)) {
-            foreach($matches as $url) {
+        $urls = extract_html_urls($content);
+        $images = $urls['img']['src'];
+        $pattern = '#img(.+)src="?\'?([[:alnum:]:?=&@/._+-]+)"?\'?#i';
+        if (!empty($images)) {
+            foreach($images as $url) {
                 $list['list'][] = array(
                     'title'=>$this->guess_filename($url, ''),
                     'source'=>url_to_absolute($baseurl, $url),
-                    'thumbnail' => $CFG->pixpath .'/f/'. mimeinfo('icon32', $url)
+                    'thumbnail'=>url_to_absolute($baseurl, $url),
+                    'thumbnail_height'=>84,
+                    'thumbnail_width'=>84
+                    //'thumbnail' => $CFG->pixpath .'/f/'. mimeinfo('icon32', $url)
                     );
             }
         }