From: stronk7 <stronk7>
Date: Mon, 9 Mar 2009 23:36:00 +0000 (+0000)
Subject: MDL-18468 restore preprocessing (split) - Added $CFG->experimentalsplitrestore option
X-Git-Url: http://git.mjollnir.org/gw?a=commitdiff_plain;h=b8f0d7d18591f3ca3c0bceb158571c3b47eca876;p=moodle.git

MDL-18468 restore preprocessing (split) - Added $CFG->experimentalsplitrestore option
to enable split of moodle.xml files into upto 19 smaller files for quicker restore.
Merged from 19_STABLE
---

diff --git a/backup/restorelib.php b/backup/restorelib.php
index 9b81966238..3749903732 100644
--- a/backup/restorelib.php
+++ b/backup/restorelib.php
@@ -4581,6 +4581,217 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
     //==                                                                                 ==
     //=====================================================================================
 
+    /// This is the class used to split, in first instance, the monolithic moodle.xml into
+    /// smaller xml files allowing the MoodleParser later to process only the required info
+    /// based in each TODO, instead of processing the whole xml for each TODO. In theory
+    /// processing time can be reduced upto 1/20th of original time (depending of the
+    /// number of TODOs in the original moodle.xml file)
+    ///
+    /// Anyway, note it's a general splitter parser, and only needs to be instantiated
+    /// with the proper destination dir and the tosplit configuration. Be careful when
+    /// using it because it doesn't support XML attributes nor real cdata out from tags.
+    /// (both not used in the target Moodle backup files)
+
+    class moodle_splitter_parser {
+        var $level = 0;            /// Level we are
+        var $tree = array();       /// Array of levels we are
+        var $cdata = '';           /// Raw storage for character data
+        var $content = '';         /// Content buffer to be printed to file
+        var $trailing= '';         /// Content of the trailing tree for each splited file
+        var $savepath = null;      /// Path to store splited files
+        var $fhandler = null;      /// Current file we are writing to
+        var $tosplit = array();    /// Array defining the files we want to split, in this format:
+                                   /// array( level/tag/level/tag => filename)
+        var $splitwords = array(); /// Denormalised array containing the potential tags
+                                   /// being a split point. To speed up check_split_point()
+        var $maxsplitlevel = 0;    /// Precalculated max level where any split happens. To speed up check_split_point()
+        var $buffersize = 65536;   /// 64KB is a good write buffer. Don't expect big benefits by increasing this.
+        var $repectformat = false; /// With this setting enabled, the splited files will look like the original one
+                                   /// with all the indentations 100% copied from original (character data outer tags).
+                                   /// But this is a waste of time from our perspective, and splited xml files are completely
+                                   /// functional without that, so we disable this for production, generating a more compact
+                                   /// XML quicker
+
+    /// PHP4 constructor
+        function moodle_splitter_parser($savepath, $tosplit = null) {
+            return $this->__construct($savepath, $tosplit);
+        }
+
+    /// PHP5 constructor
+        function __construct($savepath, $tosplit = null) {
+            $this->savepath = $savepath;
+            if (!empty($tosplit)) {
+                $this->tosplit = $tosplit;
+            } else { /// No tosplit list passed, process all the possible parts in one moodle.xml file
+                $this->tosplit = array(
+                                     '1/MOODLE_BACKUP/2/INFO'        => 'split_info.xml',
+                                     '1/MOODLE_BACKUP/2/ROLES'       => 'split_roles.xml',
+                                     '2/COURSE/3/HEADER'             => 'split_course_header.xml',
+                                     '2/COURSE/3/BLOCKS'             => 'split_blocks.xml',
+                                     '2/COURSE/3/SECTIONS'           => 'split_sections.xml',
+                                     '2/COURSE/3/FORMATDATA'         => 'split_formatdata.xml',
+                                     '2/COURSE/3/METACOURSE'         => 'split_metacourse.xml',
+                                     '2/COURSE/3/GRADEBOOK'          => 'split_gradebook.xml',
+                                     '2/COURSE/3/USERS'              => 'split_users.xml',
+                                     '2/COURSE/3/MESSAGES'           => 'split_messages.xml',
+                                     '2/COURSE/3/BLOGS'              => 'split_blogs.xml',
+                                     '2/COURSE/3/QUESTION_CATEGORIES'=> 'split_questions.xml',
+                                     '2/COURSE/3/SCALES'             => 'split_scales.xml',
+                                     '2/COURSE/3/GROUPS'             => 'split_groups.xml',
+                                     '2/COURSE/3/GROUPINGS'          => 'split_groupings.xml',
+                                     '2/COURSE/3/GROUPINGSGROUPS'    => 'split_groupingsgroups.xml',
+                                     '2/COURSE/3/EVENTS'             => 'split_events.xml',
+                                     '2/COURSE/3/MODULES'            => 'split_modules.xml',
+                                     '2/COURSE/3/LOGS'               => 'split_logs.xml'
+                                 );
+            }
+        /// Precalculate some info used to speedup checks
+            foreach ($this->tosplit as $key=>$value) {
+                $this->splitwords[basename($key)] = true;
+                if (((int) basename(dirname($key))) > $this->maxsplitlevel) {
+                    $this->maxsplitlevel = (int) basename(dirname($key));
+                }
+            }
+        }
+
+        /// Given one tag being opened, check if it's one split point.
+        /// Return false or split filename
+        function check_split_point($tag) {
+        /// Quick check. Level < 2 cannot be a split point
+            if ($this->level < 2) {
+                return false;
+            }
+        /// Quick check. Current tag against potential splitwords
+            if (!isset($this->splitwords[$tag])) {
+                return false;
+            }
+        /// Prev test passed, take a look to 2-level tosplit
+            $keytocheck = ($this->level - 1) . '/' . $this->tree[$this->level - 1] . '/' . $this->level . '/' . $this->tree[$this->level];
+            if (!isset($this->tosplit[$keytocheck])) {
+                return false;
+            }
+        /// Prev test passed, we are in a split point, return new filename
+            return $this->tosplit[$keytocheck];
+        }
+
+        /// To append data (xml-escaped) to contents buffer
+        function character_data($parser, $data) {
+
+            ///$this->content .= preg_replace($this->entity_find, $this->entity_replace, $data); ///40% slower
+            ///$this->content .= str_replace($this->entity_find, $this->entity_replace, $data);  ///25% slower
+            ///$this->content .= htmlspecialchars($data);                                        ///the best
+            /// Instead of htmlspecialchars() each chunk of character data, we are going to
+            /// concat it without transformation and will apply the htmlspecialchars() when
+            /// that character data is, efectively, going to be added to contents buffer. This
+            /// makes the number of transformations to be reduced (speedup) and avoid potential
+            /// problems with transformations being applied "in the middle" of multibyte chars.
+            $this->cdata .= $data;
+        }
+
+        /// To detect start of tags, keeping level, tree and fhandle updated.
+        /// Also handles creation of split files
+        function start_tag($parser, $tag, $attrs) {
+
+        /// Update things before processing
+            $this->level++;
+            $this->tree[$this->level] = $tag;
+
+        /// Check if we need to start a new split file,
+        /// Speedup: we only do that if we haven't a fhandler and if level <= $maxsplitlevel
+            if ($this->level <= $this->maxsplitlevel && !$this->fhandler && $newfilename = $this->check_split_point($tag)) {
+            /// Open new file handler, init everything
+                $this->fhandler = fopen($this->savepath . '/' . $newfilename, 'w');
+                $this->content = '';
+                $this->cdata = '';
+                $this->trailing = '';
+            /// Build the original leading tree (and calculate the original trailing one)
+                for ($l = 1; $l < $this->level; $l++) {
+                    $this->content .= "<{$this->tree[$l]}>\n";
+                    $this->trailing = "\n</{$this->tree[$l]}>" . $this->trailing;
+                }
+            }
+        /// Perform xml-entities transformation and add to contents buffer together with opening tag.
+        /// Speedup. We lose nice formatting of the split XML but avoid 50% of transformations and XML is 100% equivalent
+            $this->content .= ($this->repectformat ? htmlspecialchars($this->cdata) : '') . "<$tag>";
+            $this->cdata = '';
+        }
+
+        /// To detect end of tags, keeping level, tree and fhandle updated, writting contents buffer to split file.
+        /// Also handles closing of split files
+        function end_tag($parser, $tag) {
+
+        /// Perform xml-entities transformation and add to contents buffer together with closing tag, repecting (or no) format
+            $this->content .= ($this->repectformat ? htmlspecialchars($this->cdata) : htmlspecialchars(trim($this->cdata))) . "</$tag>";
+            $this->cdata = '';
+
+        /// Check if we need to close current split file
+        /// Speedup: we only do that if we have a fhandler and if level <= $maxsplitlevel
+            if ($this->level <= $this->maxsplitlevel && $this->fhandler && $newfilename = $this->check_split_point($tag)) {
+            /// Write pending contents buffer before closing. It's a must
+                fwrite($this->fhandler, $this->content);
+                $this->content = "";
+            /// Write the original trailing tree for fhandler
+                fwrite($this->fhandler, $this->trailing);
+                fclose($this->fhandler);
+                $this->fhandler = null;
+            } else {
+            /// Normal write of contents (use one buffer to improve speed)
+                if ($this->fhandler && strlen($this->content) > $this->buffersize) {
+                    fwrite($this->fhandler, $this->content);
+                    $this->content = "";
+                }
+            }
+
+        /// Update things after processing
+            $this->tree[$this->level] = "";
+            $this->level--;
+
+        }
+    }
+
+    /// This function executes the moodle_splitter_parser, causing the monolithic moodle.xml
+    /// file to be splitted in n smaller files for better treatament by the MoodleParser in restore_read_xml()
+    function restore_split_xml ($xml_file, $preferences) {
+
+        $status = true;
+
+        $xml_parser = xml_parser_create('UTF-8');
+        $split_parser = new moodle_splitter_parser(dirname($xml_file));
+        xml_set_object($xml_parser,$split_parser);
+        xml_set_element_handler($xml_parser, 'start_tag', 'end_tag');
+        xml_set_character_data_handler($xml_parser, 'character_data');
+
+        $doteach = filesize($xml_file) / 20;
+        $fromdot = 0;
+
+        $fp = fopen($xml_file,"r")
+            or $status = false;
+        if ($status) {
+            $lasttime = time();
+            while ($data = fread($fp, 8192)) {
+                if (!defined('RESTORE_SILENTLY')) {
+                    $fromdot += 8192;
+                    if ($fromdot > $doteach) {
+                        echo ".";
+                        backup_flush(300);
+                        $fromdot = 0;
+                    }
+                    if ((time() - $lasttime) > 10) {
+                        $lasttime = time();
+                        backup_flush(300);
+                    }
+                }
+                xml_parse($xml_parser, $data, feof($fp))
+                    or die(sprintf("XML error: %s at line %d",
+                                   xml_error_string(xml_get_error_code($xml_parser)),
+                                   xml_get_current_line_number($xml_parser)));
+            }
+            fclose($fp);
+        }
+        xml_parser_free($xml_parser);
+        return $status;
+    }
+
     //This is the class used to do all the xml parse
     class MoodleParser {
 
@@ -5182,7 +5393,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
         }
 
         function endElementRoles($parser, $tagName) {
-            //Check if we are into INFO zone
+            //Check if we are into ROLES zone
             if ($this->tree[2] == "ROLES") {
 
                 if ($this->tree[3] == "ROLE") {
@@ -5230,7 +5441,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
                 }
             }
 
-            //Stop parsing if todo = INFO and tagName = INFO (en of the tag, of course)
+            //Stop parsing if todo = ROLES and tagName = ROLES (en of the tag, of course)
             //Speed up a lot (avoid parse all)
             if ($tagName == "ROLES") {
                 $this->finished = true;
@@ -7350,8 +7561,31 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
     //This function executes the MoodleParser
     function restore_read_xml ($xml_file,$todo,$preferences) {
 
+        global $CFG;
+
         $status = true;
 
+    /// If enabled in the site, use split files instead of original moodle.xml file
+    /// This will speed parsing speed upto 20x.
+        if (!empty($CFG->experimentalsplitrestore)) {
+        /// Use splite file, else nothing to process (saves one full parsing for each non-existing todo)
+            $splitfile= dirname($xml_file) . '/' . strtolower('split_' . $todo . '.xml');
+            if (file_exists($splitfile)) {
+                $xml_file = $splitfile;
+                debugging("Info: todo=$todo, using split file", DEBUG_DEVELOPER);
+            } else {
+            /// For some todos, that are used in earlier restore steps (restore_precheck(), restore_form...
+            /// allow fallback to monolithic moodle.xml. Those todos are at the beggining of the xml, so
+            /// it doesn't hurts too much.
+                if ($todo == 'INFO' || $todo == 'COURSE_HEADER' || $todo == 'ROLES') {
+                    debugging("Info: todo=$todo, no split file. Fallback to moodle.xml", DEBUG_DEVELOPER);
+                } else {
+                    debugging("Info: todo=$todo, no split file. Parse skipped", DEBUG_DEVELOPER);
+                    return true;
+                }
+            }
+        }
+
         $xml_parser = xml_parser_create('UTF-8');
         $moodle_parser = new MoodleParser();
         $moodle_parser->todo = $todo;
@@ -7410,7 +7644,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
         if ($status) {
             // MDL-9290 performance improvement on reading large xml
             $lasttime = time(); // crmas
-            while ($data = fread($fp, 4096) and !$moodle_parser->finished) {
+            while ($data = fread($fp, 8192) and !$moodle_parser->finished) {
              
                 if ((time() - $lasttime) > 5) {
                     $lasttime = time();
@@ -7744,9 +7978,26 @@ define('RESTORE_GROUPS_GROUPINGS', 3);
             echo "<ul>";
         }
 
-        //Localtion of the xml file
+        //Location of the xml file
         $xml_file = $CFG->dataroot."/temp/backup/".$restore->backup_unique_code."/moodle.xml";
 
+        //Preprocess the moodle.xml file spliting into smaller chucks (modules, users, logs...)
+        //for optimal parsing later in the restore process.
+        if (!empty($CFG->experimentalsplitrestore)) {
+            if (!defined('RESTORE_SILENTLY')) {
+                echo '<li>'.get_string('preprocessingbackupfile') . '</li>';
+            }
+            //First of all, split moodle.xml into handy files
+            if (!restore_split_xml ($xml_file, $restore)) {
+                if (!defined('RESTORE_SILENTLY')) {
+                    notify("Error proccessing moodle.xml file. Process ended.");
+                } else {
+                    $errorstr = "Error proccessing moodle.xml file. Process ended.";
+                }
+                return false;
+            }
+        }
+
         //If we've selected to restore into new course
         //create it (course)
         //Saving conversion id variables into backup_tables
diff --git a/lang/en_utf8/moodle.php b/lang/en_utf8/moodle.php
index a5949e82a8..f518aadabc 100644
--- a/lang/en_utf8/moodle.php
+++ b/lang/en_utf8/moodle.php
@@ -1251,6 +1251,7 @@ $string['potentialteachers'] = 'Potential teachers';
 $string['preferences'] = 'Preferences';
 $string['preferredlanguage'] = 'Preferred language';
 $string['preferredtheme'] = 'Preferred theme';
+$string['preprocessingbackupfile'] = 'Preprocessing backup file';
 $string['preview'] = 'Preview';
 $string['previewhtml'] = 'HTML Format Preview';
 $string['previeworchoose'] = 'Preview or choose a theme';