MDL-18468 restore preprocessing (split) - Added $CFG->experimentalsplitrestore option

author stronk7 <stronk7>

Mon, 9 Mar 2009 23:36:00 +0000 (23:36 +0000)

committer stronk7 <stronk7>

Mon, 9 Mar 2009 23:36:00 +0000 (23:36 +0000)
author stronk7 <stronk7>
Mon, 9 Mar 2009 23:36:00 +0000 (23:36 +0000)
committer stronk7 <stronk7>
Mon, 9 Mar 2009 23:36:00 +0000 (23:36 +0000)
diff --git a/backup/restorelib.php b/backup/restorelib.php

index 9b819662386c1cc147f3dc234b82a95e7c9cab34..37499037324607b42d7383c20bd70ba0ac675a6f 100644 (file)
--- a/backup/restorelib.php
+++ b/backup/restorelib.php
@@ -4581,6 +4581,217 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
      //==                                                                                 ==
      //=====================================================================================
  
+    /// This is the class used to split, in the first instance, the monolithic moodle.xml into
+    /// smaller xml files, allowing the MoodleParser to later process only the required info
+    /// for each TODO, instead of processing the whole xml for each TODO. In theory,
+    /// processing time can be reduced to as little as 1/20th of the original time (depending
+    /// on the number of TODOs in the original moodle.xml file)
+    ///
+    /// Anyway, note it's a general splitter parser, and only needs to be instantiated
+    /// with the proper destination dir and the tosplit configuration. Be careful when
+    /// using it because it supports neither XML attributes nor real cdata outside of tags
+    /// (neither is used in the target Moodle backup files).
+
+    class moodle_splitter_parser {
+        var $level = 0;            /// Current nesting depth (0 = outside the root element)
+        var $tree = array();       /// Name of the tag open at each level, indexed by level
+        var $cdata = '';           /// Raw (unescaped) character data collected since the last tag
+        var $content = '';         /// Content buffer waiting to be written to the current split file
+        var $trailing= '';         /// Closing tags of the ancestors, appended when the current split file is closed
+        var $savepath = null;      /// Directory where the split files are stored
+        var $fhandler = null;      /// Handle of the split file being written, null when no split file is open
+        var $tosplit = array();    /// Array defining the files we want to split, in this format:
+                                   /// array( level/tag/level/tag => filename)
+        var $splitwords = array(); /// Denormalised array (tag => true) containing the tags that can
+                                   /// be a split point. To speed up check_split_point()
+        var $maxsplitlevel = 0;    /// Precalculated max level where any split happens. To speed up check_split_point()
+        var $buffersize = 65536;   /// Write buffer threshold in bytes (64KB). Don't expect big benefits by increasing this.
+        var $repectformat = false; /// (sic, name kept for compatibility) When enabled, the character data found
+                                   /// before opening tags and the untrimmed data before closing tags are copied
+                                   /// too, so split files keep the original indentation. Disabled by default
+                                   /// because split files are completely functional without it and a more
+                                   /// compact XML is generated quicker
+
+    /// PHP4 constructor, simply delegates to the PHP5 one
+        function moodle_splitter_parser($savepath, $tosplit = null) {
+            return $this->__construct($savepath, $tosplit);
+        }
+
+    /// PHP5 constructor
+    /// $savepath: directory where the split files will be created
+    /// $tosplit:  optional array('parentlevel/PARENTTAG/level/TAG' => filename). When empty,
+    ///            the default list below (all the known parts of moodle.xml) is used
+        function __construct($savepath, $tosplit = null) {
+            $this->savepath = $savepath;
+            if (!empty($tosplit)) {
+                $this->tosplit = $tosplit;
+            } else { /// No tosplit list passed, process all the possible parts in one moodle.xml file
+                $this->tosplit = array(
+                                     '1/MOODLE_BACKUP/2/INFO'        => 'split_info.xml',
+                                     '1/MOODLE_BACKUP/2/ROLES'       => 'split_roles.xml',
+                                     '2/COURSE/3/HEADER'             => 'split_course_header.xml',
+                                     '2/COURSE/3/BLOCKS'             => 'split_blocks.xml',
+                                     '2/COURSE/3/SECTIONS'           => 'split_sections.xml',
+                                     '2/COURSE/3/FORMATDATA'         => 'split_formatdata.xml',
+                                     '2/COURSE/3/METACOURSE'         => 'split_metacourse.xml',
+                                     '2/COURSE/3/GRADEBOOK'          => 'split_gradebook.xml',
+                                     '2/COURSE/3/USERS'              => 'split_users.xml',
+                                     '2/COURSE/3/MESSAGES'           => 'split_messages.xml',
+                                     '2/COURSE/3/BLOGS'              => 'split_blogs.xml',
+                                     '2/COURSE/3/QUESTION_CATEGORIES'=> 'split_questions.xml',
+                                     '2/COURSE/3/SCALES'             => 'split_scales.xml',
+                                     '2/COURSE/3/GROUPS'             => 'split_groups.xml',
+                                     '2/COURSE/3/GROUPINGS'          => 'split_groupings.xml',
+                                     '2/COURSE/3/GROUPINGSGROUPS'    => 'split_groupingsgroups.xml',
+                                     '2/COURSE/3/EVENTS'             => 'split_events.xml',
+                                     '2/COURSE/3/MODULES'            => 'split_modules.xml',
+                                     '2/COURSE/3/LOGS'               => 'split_logs.xml'
+                                 );
+            }
+        /// Precalculate some info used to speedup checks: for a key like '2/COURSE/3/USERS'
+        /// basename() is the split tag (USERS) and basename(dirname()) is its level (3)
+            foreach ($this->tosplit as $key=>$value) {
+                $this->splitwords[basename($key)] = true;
+                if (((int) basename(dirname($key))) > $this->maxsplitlevel) {
+                    $this->maxsplitlevel = (int) basename(dirname($key));
+                }
+            }
+        }
+
+        /// Given one tag being opened (or closed), check if it's one split point.
+        /// It relies on $this->level and $this->tree being already updated for that tag.
+        /// Return false or split filename
+        function check_split_point($tag) {
+        /// Quick check. Level < 2 cannot be a split point
+            if ($this->level < 2) {
+                return false;
+            }
+        /// Quick check. Current tag against potential splitwords
+            if (!isset($this->splitwords[$tag])) {
+                return false;
+            }
+        /// Prev test passed, build the 2-level key (parentlevel/PARENT/level/TAG) and look for it in tosplit
+            $keytocheck = ($this->level - 1) . '/' . $this->tree[$this->level - 1] . '/' . $this->level . '/' . $this->tree[$this->level];
+            if (!isset($this->tosplit[$keytocheck])) {
+                return false;
+            }
+        /// Prev test passed, we are in a split point, return new filename
+            return $this->tosplit[$keytocheck];
+        }
+
+        /// Character data handler. Only accumulates the raw data, it is xml-escaped later
+        function character_data($parser, $data) {
+
+            /// Alternatives measured by the original author when escaping each chunk here:
+            /// preg_replace() was 40% slower, str_replace() 25% slower and htmlspecialchars()
+            /// the best. Instead of htmlspecialchars() each chunk of character data, we
+            /// concat it without transformation and apply the htmlspecialchars() when
+            /// that character data is, effectively, going to be added to contents buffer. This
+            /// reduces the number of transformations (speedup) and avoids potential
+            /// problems with transformations being applied "in the middle" of multibyte chars.
+            $this->cdata .= $data;
+        }
+
+        /// Start element handler. Keeps level, tree and fhandler updated.
+        /// Also handles creation of split files. $attrs is ignored (attributes aren't supported)
+        function start_tag($parser, $tag, $attrs) {
+
+        /// Update things before processing
+            $this->level++;
+            $this->tree[$this->level] = $tag;
+
+        /// Check if we need to start a new split file,
+        /// Speedup: we only do that if we haven't a fhandler and if level <= $maxsplitlevel
+            if ($this->level <= $this->maxsplitlevel && !$this->fhandler && $newfilename = $this->check_split_point($tag)) {
+            /// Open new file handler, init everything. If the file cannot be created fopen()
+            /// raises its own warning and we continue without handler (nothing is written)
+                $handle = fopen($this->savepath . '/' . $newfilename, 'w');
+                $this->fhandler = $handle ? $handle : null;
+                $this->content = '';
+                $this->cdata = '';
+                $this->trailing = '';
+            /// Build the original leading tree (and calculate the original trailing one)
+                for ($l = 1; $l < $this->level; $l++) {
+                    $this->content .= "<{$this->tree[$l]}>\n";
+                    $this->trailing = "\n</{$this->tree[$l]}>" . $this->trailing;
+                }
+            }
+
+        /// Without split file open, nothing buffered here would ever be written (the buffer is
+        /// reset when the next split file is opened), so drop it now instead of keeping it in memory
+            if (!$this->fhandler) {
+                $this->content = '';
+                $this->cdata = '';
+                return;
+            }
+
+        /// Perform xml-entities transformation and add to contents buffer together with opening tag.
+        /// Speedup. We lose nice formatting of the split XML but avoid 50% of transformations and XML is 100% equivalent
+            $this->content .= ($this->repectformat ? htmlspecialchars($this->cdata) : '') . "<$tag>";
+            $this->cdata = '';
+        }
+
+        /// End element handler. Keeps level, tree and fhandler updated, writing contents buffer to split file.
+        /// Also handles closing of split files
+        function end_tag($parser, $tag) {
+
+            if ($this->fhandler) {
+            /// Perform xml-entities transformation and add to contents buffer together with closing tag, respecting (or not) format
+                $this->content .= ($this->repectformat ? htmlspecialchars($this->cdata) : htmlspecialchars(trim($this->cdata))) . "</$tag>";
+                $this->cdata = '';
+
+            /// Check if we need to close current split file
+            /// Speedup: we only do that if level <= $maxsplitlevel
+                if ($this->level <= $this->maxsplitlevel && $this->check_split_point($tag)) {
+                /// Write pending contents buffer before closing. It's a must
+                    fwrite($this->fhandler, $this->content);
+                    $this->content = "";
+                /// Write the original trailing tree for fhandler
+                    fwrite($this->fhandler, $this->trailing);
+                    fclose($this->fhandler);
+                    $this->fhandler = null;
+                } else if (strlen($this->content) > $this->buffersize) {
+                /// Normal write of contents (use one buffer to improve speed)
+                    fwrite($this->fhandler, $this->content);
+                    $this->content = "";
+                }
+            } else {
+            /// Outside any split file: this data belongs to no file, drop it
+                $this->content = '';
+                $this->cdata = '';
+            }
+
+        /// Update things after processing
+            $this->tree[$this->level] = "";
+            $this->level--;
+
+        }
+    }
+
+    /// This function executes the moodle_splitter_parser, causing the monolithic moodle.xml
+    /// file to be split into n smaller files for better treatment by the MoodleParser in restore_read_xml()
+    function restore_split_xml ($xml_file, $preferences) {
+    /// Feeds $xml_file, in chunks, to one moodle_splitter_parser that creates the
+    /// split files in the same directory as $xml_file.
+    /// $xml_file:    full path to the moodle.xml file to split
+    /// $preferences: not used here, kept in the signature for the callers
+    /// Returns true if the whole file was read and parsed, false if the file cannot
+    /// be opened or read, or if it isn't well-formed XML.
+
+        $chunksize = 8192;
+
+    /// Nothing to do (and nothing to clean) if the file cannot be opened
+        $fp = fopen($xml_file,"r");
+        if (!$fp) {
+            return false;
+        }
+
+        $status = true;
+
+        $xml_parser = xml_parser_create('UTF-8');
+        $split_parser = new moodle_splitter_parser(dirname($xml_file));
+        xml_set_object($xml_parser,$split_parser);
+        xml_set_element_handler($xml_parser, 'start_tag', 'end_tag');
+        xml_set_character_data_handler($xml_parser, 'character_data');
+
+    /// One progress dot is printed, approximately, for each 1/20th of the file
+        $doteach = filesize($xml_file) / 20;
+        $fromdot = 0;
+        $lasttime = time();
+
+    /// Read until the end of the file. The last call to xml_parse() is always flagged as
+    /// final (with empty data if needed) so incomplete documents are detected
+        $finalized = false;
+        while ($status && !$finalized) {
+            $data = fread($fp, $chunksize);
+            if ($data === false) {
+                $status = false;
+                break;
+            }
+            if (!defined('RESTORE_SILENTLY') && $data !== '') {
+                $fromdot += $chunksize;
+                if ($fromdot > $doteach) {
+                    echo ".";
+                    backup_flush(300);
+                    $fromdot = 0;
+                }
+            /// Flush output every 10 seconds, at least
+                if ((time() - $lasttime) > 10) {
+                    $lasttime = time();
+                    backup_flush(300);
+                }
+            }
+            $finalized = ($data === '' || feof($fp));
+            if (!xml_parse($xml_parser, $data, $finalized)) {
+            /// Report the detail and return false, the caller decides how to end the process
+                debugging(sprintf("XML error: %s at line %d",
+                                  xml_error_string(xml_get_error_code($xml_parser)),
+                                  xml_get_current_line_number($xml_parser)));
+                $status = false;
+            }
+        }
+        fclose($fp);
+
+    /// If parsing stopped in the middle of one split file, don't leave it open
+        if ($split_parser->fhandler) {
+            fclose($split_parser->fhandler);
+            $split_parser->fhandler = null;
+        }
+
+        xml_parser_free($xml_parser);
+        return $status;
+    }
+
      //This is the class used to do all the xml parse
      class MoodleParser {
  
@@ -5182,7 +5393,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
          }
  
          function endElementRoles($parser, $tagName) {
-            //Check if we are into INFO zone
+            //Check if we are into ROLES zone
              if ($this->tree[2] == "ROLES") {
  
                  if ($this->tree[3] == "ROLE") {
@@ -5230,7 +5441,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
                  }
              }
  
-            //Stop parsing if todo = INFO and tagName = INFO (en of the tag, of course)
+            //Stop parsing if todo = ROLES and tagName = ROLES (en of the tag, of course)
              //Speed up a lot (avoid parse all)
              if ($tagName == "ROLES") {
                  $this->finished = true;
@@ -7350,8 +7561,31 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
      //This function executes the MoodleParser
      function restore_read_xml ($xml_file,$todo,$preferences) {
  
+        global $CFG;
+
          $status = true;
  
+    /// If enabled in the site, use split files instead of original moodle.xml file
+    /// This will speed parsing speed upto 20x.
+        if (!empty($CFG->experimentalsplitrestore)) {
+        /// Use splite file, else nothing to process (saves one full parsing for each non-existing todo)
+            $splitfile= dirname($xml_file) . '/' . strtolower('split_' . $todo . '.xml');
+            if (file_exists($splitfile)) {
+                $xml_file = $splitfile;
+                debugging("Info: todo=$todo, using split file", DEBUG_DEVELOPER);
+            } else {
+            /// For some todos, that are used in earlier restore steps (restore_precheck(), restore_form...
+            /// allow fallback to monolithic moodle.xml. Those todos are at the beggining of the xml, so
+            /// it doesn't hurts too much.
+                if ($todo == 'INFO' || $todo == 'COURSE_HEADER' || $todo == 'ROLES') {
+                    debugging("Info: todo=$todo, no split file. Fallback to moodle.xml", DEBUG_DEVELOPER);
+                } else {
+                    debugging("Info: todo=$todo, no split file. Parse skipped", DEBUG_DEVELOPER);
+                    return true;
+                }
+            }
+        }
+
          $xml_parser = xml_parser_create('UTF-8');
          $moodle_parser = new MoodleParser();
          $moodle_parser->todo = $todo;
@@ -7410,7 +7644,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
          if ($status) {
              // MDL-9290 performance improvement on reading large xml
              $lasttime = time(); // crmas
-            while ($data = fread($fp, 4096) and !$moodle_parser->finished) {
+            while ($data = fread($fp, 8192) and !$moodle_parser->finished) {
               
                  if ((time() - $lasttime) > 5) {
                      $lasttime = time();
@@ -7744,9 +7978,26 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
              echo "<ul>";
          }
  
-        //Localtion of the xml file
+        //Location of the xml file
          $xml_file = $CFG->dataroot."/temp/backup/".$restore->backup_unique_code."/moodle.xml";
  
+        //Preprocess the moodle.xml file spliting into smaller chucks (modules, users, logs...)
+        //for optimal parsing later in the restore process.
+        if (!empty($CFG->experimentalsplitrestore)) {
+            if (!defined('RESTORE_SILENTLY')) {
+                echo '<li>'.get_string('preprocessingbackupfile') . '</li>';
+            }
+            //First of all, split moodle.xml into handy files
+            if (!restore_split_xml ($xml_file, $restore)) {
+                if (!defined('RESTORE_SILENTLY')) {
+                    notify("Error proccessing moodle.xml file. Process ended.");
+                } else {
+                    $errorstr = "Error proccessing moodle.xml file. Process ended.";
+                }
+                return false;
+            }
+        }
+
          //If we've selected to restore into new course
          //create it (course)
          //Saving conversion id variables into backup_tables
diff --git a/lang/en_utf8/moodle.php b/lang/en_utf8/moodle.php

index a5949e82a891e065f535ebc2833a82b1f5e4a9b1..f518aadabced3dcf130195151c4055d2c1bbd862 100644 (file)
--- a/lang/en_utf8/moodle.php
+++ b/lang/en_utf8/moodle.php
@@ -1251,6 +1251,7 @@ $string['potentialteachers'] = 'Potential teachers';
  $string['preferences'] = 'Preferences';
  $string['preferredlanguage'] = 'Preferred language';
  $string['preferredtheme'] = 'Preferred theme';
+$string['preprocessingbackupfile'] = 'Preprocessing backup file';
  $string['preview'] = 'Preview';
  $string['previewhtml'] = 'HTML Format Preview';
  $string['previeworchoose'] = 'Preview or choose a theme';
author	stronk7 <stronk7>
	Mon, 9 Mar 2009 23:36:00 +0000 (23:36 +0000)
committer	stronk7 <stronk7>
	Mon, 9 Mar 2009 23:36:00 +0000 (23:36 +0000)
backup/restorelib.php		patch \| blob \| history
lang/en_utf8/moodle.php		patch \| blob \| history