From: stronk7 Date: Mon, 9 Mar 2009 23:36:00 +0000 (+0000) Subject: MDL-18468 restore preprocessing (split) - Added $CFG->experimentalsplitrestore option X-Git-Url: http://git.mjollnir.org/gw?a=commitdiff_plain;h=b8f0d7d18591f3ca3c0bceb158571c3b47eca876;p=moodle.git MDL-18468 restore preprocessing (split) - Added $CFG->experimentalsplitrestore option to enable split of moodle.xml files into upto 19 smaller files for quicker restore. Merged from 19_STABLE --- diff --git a/backup/restorelib.php b/backup/restorelib.php index 9b81966238..3749903732 100644 --- a/backup/restorelib.php +++ b/backup/restorelib.php @@ -4581,6 +4581,217 @@ define('RESTORE_GROUPS_GROUPINGS', 3); //== == //===================================================================================== + /// This is the class used to split, in first instance, the monolithic moodle.xml into + /// smaller xml files allowing the MoodleParser later to process only the required info + /// based in each TODO, instead of processing the whole xml for each TODO. In theory + /// processing time can be reduced upto 1/20th of original time (depending of the + /// number of TODOs in the original moodle.xml file) + /// + /// Anyway, note it's a general splitter parser, and only needs to be instantiated + /// with the proper destination dir and the tosplit configuration. Be careful when + /// using it because it doesn't support XML attributes nor real cdata out from tags. + /// (both not used in the target Moodle backup files) + + class moodle_splitter_parser { + var $level = 0; /// Level we are + var $tree = array(); /// Array of levels we are + var $cdata = ''; /// Raw storage for character data + var $content = ''; /// Content buffer to be printed to file + var $trailing= ''; /// Content of the trailing tree for each splited file + var $savepath = null; /// Path to store splited files + var $fhandler = null; /// Current file we are writing to + var $tosplit = array(); /// Array defining the files we want to split, in this format: + /// array( level/tag/level/tag => filename) + var $splitwords = array(); /// Denormalised array containing the potential tags + /// being a split point. To speed up check_split_point() + var $maxsplitlevel = 0; /// Precalculated max level where any split happens. To speed up check_split_point() + var $buffersize = 65536; /// 64KB is a good write buffer. Don't expect big benefits by increasing this. + var $repectformat = false; /// With this setting enabled, the splited files will look like the original one + /// with all the indentations 100% copied from original (character data outer tags). + /// But this is a waste of time from our perspective, and splited xml files are completely + /// functional without that, so we disable this for production, generating a more compact + /// XML quicker + + /// PHP4 constructor + function moodle_splitter_parser($savepath, $tosplit = null) { + return $this->__construct($savepath, $tosplit); + } + + /// PHP5 constructor + function __construct($savepath, $tosplit = null) { + $this->savepath = $savepath; + if (!empty($tosplit)) { + $this->tosplit = $tosplit; + } else { /// No tosplit list passed, process all the possible parts in one moodle.xml file + $this->tosplit = array( + '1/MOODLE_BACKUP/2/INFO' => 'split_info.xml', + '1/MOODLE_BACKUP/2/ROLES' => 'split_roles.xml', + '2/COURSE/3/HEADER' => 'split_course_header.xml', + '2/COURSE/3/BLOCKS' => 'split_blocks.xml', + '2/COURSE/3/SECTIONS' => 'split_sections.xml', + '2/COURSE/3/FORMATDATA' => 'split_formatdata.xml', + '2/COURSE/3/METACOURSE' => 'split_metacourse.xml', + '2/COURSE/3/GRADEBOOK' => 'split_gradebook.xml', + '2/COURSE/3/USERS' => 'split_users.xml', + '2/COURSE/3/MESSAGES' => 'split_messages.xml', + '2/COURSE/3/BLOGS' => 'split_blogs.xml', + '2/COURSE/3/QUESTION_CATEGORIES'=> 'split_questions.xml', + '2/COURSE/3/SCALES' => 'split_scales.xml', + '2/COURSE/3/GROUPS' => 'split_groups.xml', + '2/COURSE/3/GROUPINGS' => 'split_groupings.xml', + '2/COURSE/3/GROUPINGSGROUPS' => 'split_groupingsgroups.xml', + '2/COURSE/3/EVENTS' => 'split_events.xml', + '2/COURSE/3/MODULES' => 'split_modules.xml', + '2/COURSE/3/LOGS' => 'split_logs.xml' + ); + } + /// Precalculate some info used to speedup checks + foreach ($this->tosplit as $key=>$value) { + $this->splitwords[basename($key)] = true; + if (((int) basename(dirname($key))) > $this->maxsplitlevel) { + $this->maxsplitlevel = (int) basename(dirname($key)); + } + } + } + + /// Given one tag being opened, check if it's one split point. + /// Return false or split filename + function check_split_point($tag) { + /// Quick check. Level < 2 cannot be a split point + if ($this->level < 2) { + return false; + } + /// Quick check. Current tag against potential splitwords + if (!isset($this->splitwords[$tag])) { + return false; + } + /// Prev test passed, take a look to 2-level tosplit + $keytocheck = ($this->level - 1) . '/' . $this->tree[$this->level - 1] . '/' . $this->level . '/' . $this->tree[$this->level]; + if (!isset($this->tosplit[$keytocheck])) { + return false; + } + /// Prev test passed, we are in a split point, return new filename + return $this->tosplit[$keytocheck]; + } + + /// To append data (xml-escaped) to contents buffer + function character_data($parser, $data) { + + ///$this->content .= preg_replace($this->entity_find, $this->entity_replace, $data); ///40% slower + ///$this->content .= str_replace($this->entity_find, $this->entity_replace, $data); ///25% slower + ///$this->content .= htmlspecialchars($data); ///the best + /// Instead of htmlspecialchars() each chunk of character data, we are going to + /// concat it without transformation and will apply the htmlspecialchars() when + /// that character data is, efectively, going to be added to contents buffer. This + /// makes the number of transformations to be reduced (speedup) and avoid potential + /// problems with transformations being applied "in the middle" of multibyte chars. + $this->cdata .= $data; + } + + /// To detect start of tags, keeping level, tree and fhandle updated. + /// Also handles creation of split files + function start_tag($parser, $tag, $attrs) { + + /// Update things before processing + $this->level++; + $this->tree[$this->level] = $tag; + + /// Check if we need to start a new split file, + /// Speedup: we only do that if we haven't a fhandler and if level <= $maxsplitlevel + if ($this->level <= $this->maxsplitlevel && !$this->fhandler && $newfilename = $this->check_split_point($tag)) { + /// Open new file handler, init everything + $this->fhandler = fopen($this->savepath . '/' . $newfilename, 'w'); + $this->content = ''; + $this->cdata = ''; + $this->trailing = ''; + /// Build the original leading tree (and calculate the original trailing one) + for ($l = 1; $l < $this->level; $l++) { + $this->content .= "<{$this->tree[$l]}>\n"; + $this->trailing = "\ntree[$l]}>" . $this->trailing; + } + } + /// Perform xml-entities transformation and add to contents buffer together with opening tag. + /// Speedup. We lose nice formatting of the split XML but avoid 50% of transformations and XML is 100% equivalent + $this->content .= ($this->repectformat ? htmlspecialchars($this->cdata) : '') . "<$tag>"; + $this->cdata = ''; + } + + /// To detect end of tags, keeping level, tree and fhandle updated, writting contents buffer to split file. + /// Also handles closing of split files + function end_tag($parser, $tag) { + + /// Perform xml-entities transformation and add to contents buffer together with closing tag, repecting (or no) format + $this->content .= ($this->repectformat ? htmlspecialchars($this->cdata) : htmlspecialchars(trim($this->cdata))) . ""; + $this->cdata = ''; + + /// Check if we need to close current split file + /// Speedup: we only do that if we have a fhandler and if level <= $maxsplitlevel + if ($this->level <= $this->maxsplitlevel && $this->fhandler && $newfilename = $this->check_split_point($tag)) { + /// Write pending contents buffer before closing. It's a must + fwrite($this->fhandler, $this->content); + $this->content = ""; + /// Write the original trailing tree for fhandler + fwrite($this->fhandler, $this->trailing); + fclose($this->fhandler); + $this->fhandler = null; + } else { + /// Normal write of contents (use one buffer to improve speed) + if ($this->fhandler && strlen($this->content) > $this->buffersize) { + fwrite($this->fhandler, $this->content); + $this->content = ""; + } + } + + /// Update things after processing + $this->tree[$this->level] = ""; + $this->level--; + + } + } + + /// This function executes the moodle_splitter_parser, causing the monolithic moodle.xml + /// file to be splitted in n smaller files for better treatament by the MoodleParser in restore_read_xml() + function restore_split_xml ($xml_file, $preferences) { + + $status = true; + + $xml_parser = xml_parser_create('UTF-8'); + $split_parser = new moodle_splitter_parser(dirname($xml_file)); + xml_set_object($xml_parser,$split_parser); + xml_set_element_handler($xml_parser, 'start_tag', 'end_tag'); + xml_set_character_data_handler($xml_parser, 'character_data'); + + $doteach = filesize($xml_file) / 20; + $fromdot = 0; + + $fp = fopen($xml_file,"r") + or $status = false; + if ($status) { + $lasttime = time(); + while ($data = fread($fp, 8192)) { + if (!defined('RESTORE_SILENTLY')) { + $fromdot += 8192; + if ($fromdot > $doteach) { + echo "."; + backup_flush(300); + $fromdot = 0; + } + if ((time() - $lasttime) > 10) { + $lasttime = time(); + backup_flush(300); + } + } + xml_parse($xml_parser, $data, feof($fp)) + or die(sprintf("XML error: %s at line %d", + xml_error_string(xml_get_error_code($xml_parser)), + xml_get_current_line_number($xml_parser))); + } + fclose($fp); + } + xml_parser_free($xml_parser); + return $status; + } + //This is the class used to do all the xml parse class MoodleParser { @@ -5182,7 +5393,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3); } function endElementRoles($parser, $tagName) { - //Check if we are into INFO zone + //Check if we are into ROLES zone if ($this->tree[2] == "ROLES") { if ($this->tree[3] == "ROLE") { @@ -5230,7 +5441,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3); } } - //Stop parsing if todo = INFO and tagName = INFO (en of the tag, of course) + //Stop parsing if todo = ROLES and tagName = ROLES (en of the tag, of course) //Speed up a lot (avoid parse all) if ($tagName == "ROLES") { $this->finished = true; @@ -7350,8 +7561,31 @@ define('RESTORE_GROUPS_GROUPINGS', 3); //This function executes the MoodleParser function restore_read_xml ($xml_file,$todo,$preferences) { + global $CFG; + $status = true; + /// If enabled in the site, use split files instead of original moodle.xml file + /// This will speed parsing speed upto 20x. + if (!empty($CFG->experimentalsplitrestore)) { + /// Use splite file, else nothing to process (saves one full parsing for each non-existing todo) + $splitfile= dirname($xml_file) . '/' . strtolower('split_' . $todo . '.xml'); + if (file_exists($splitfile)) { + $xml_file = $splitfile; + debugging("Info: todo=$todo, using split file", DEBUG_DEVELOPER); + } else { + /// For some todos, that are used in earlier restore steps (restore_precheck(), restore_form... + /// allow fallback to monolithic moodle.xml. Those todos are at the beggining of the xml, so + /// it doesn't hurts too much. + if ($todo == 'INFO' || $todo == 'COURSE_HEADER' || $todo == 'ROLES') { + debugging("Info: todo=$todo, no split file. Fallback to moodle.xml", DEBUG_DEVELOPER); + } else { + debugging("Info: todo=$todo, no split file. Parse skipped", DEBUG_DEVELOPER); + return true; + } + } + } + $xml_parser = xml_parser_create('UTF-8'); $moodle_parser = new MoodleParser(); $moodle_parser->todo = $todo; @@ -7410,7 +7644,7 @@ define('RESTORE_GROUPS_GROUPINGS', 3); if ($status) { // MDL-9290 performance improvement on reading large xml $lasttime = time(); // crmas - while ($data = fread($fp, 4096) and !$moodle_parser->finished) { + while ($data = fread($fp, 8192) and !$moodle_parser->finished) { if ((time() - $lasttime) > 5) { $lasttime = time(); @@ -7744,9 +7978,26 @@ define('RESTORE_GROUPS_GROUPINGS', 3); echo "