<?php
/**
* @file tools/harvest.php
*
* Copyright (c) 2005-2008 Alec Smecher and John Willinsky
* Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
*
* @class harvest
* @ingroup tools
*
* @brief CLI tool to harvest an archive.
*
*/
// $Id: harvest.php,v 1.21 2009/01/20 17:22:46 asmecher Exp $
define('INDEX_FILE_LOCATION', dirname(dirname(__FILE__)) . '/index.php');
require(dirname(dirname(__FILE__)) . '/lib/pkp/classes/cliTool/CliTool.inc.php');
/**
 * Command-line tool that harvests a single archive, or every archive, using
 * the harvester plugin named by each archive. When no archive is selected it
 * prints usage information or a list of the known archives instead.
 */
class harvest extends CommandLineTool {
	/** @var $firstParam mixed First argument after the script name: an archive ID or a keyword such as "all" */
	var $firstParam;

	/** @var $archives object Iterator (exposes next()) over the archives to harvest; null if none was selected */
	var $archives;

	/** @var $params array Flags and name=value options, handed to the harvester plugin's updateIndex() */
	var $params;

	/**
	 * Constructor.
	 * Resolves the first argument into the set of archives to harvest and
	 * parses the remaining arguments into $this->params.
	 * @param $argv array Command-line arguments, with the script name first
	 */
	function harvest($argv = array()) {
		parent::CommandLineTool($argv);

		array_shift($argv); // Clear the tool name from argv
		$this->firstParam = array_shift($argv);

		$archiveDao =& DAORegistry::getDAO('ArchiveDAO');
		if ($this->firstParam === 'all') {
			$this->archives =& $archiveDao->getArchives();
		}
		else {
			// Only a purely numeric argument is accepted as an archive ID.
			// A bare (int) cast would turn e.g. "12abc" or "1,2" into 12 or 1
			// and silently harvest (or flush) an archive the user never
			// asked for; such input is now treated as an invalid ID.
			$archive = null;
			if (is_string($this->firstParam) && preg_match('/^[0-9]+\z/', $this->firstParam)) {
				$archive =& $archiveDao->getArchive((int) $this->firstParam, false);
			}

			if ($archive) {
				// Wrap the single archive so execute() can iterate it the
				// same way as the result of getArchives().
				$archives = array(&$archive);
				import('core.ArrayItemIterator');
				$this->archives = new ArrayItemIterator($archives);
			} else {
				$this->archives = null; // Invalid ID specified
			}
		}

		// Set the various flags for the parser, if supported.
		$this->params = array();
		foreach ($argv as $arg) switch ($arg) {
			case 'verbose':
				$this->params['callback'] = array(&$this, 'statusCallback');
				// No break here: "verbose" falls through to the default
				// branch and is therefore also stored as a boolean flag.
			default:
				if (($i = strpos($arg, '=')) !== false) {
					// Treat the parameter like a name=value pair
					$paramName = substr($arg, 0, $i);
					$paramValue = substr($arg, $i+1);
					if (!isset($this->params[$paramName])) {
						$this->params[$paramName] = $paramValue;
					} else {
						// A repeated name accumulates its values in an array.
						if (is_array($this->params[$paramName])) $this->params[$paramName][] = $paramValue;
						else $this->params[$paramName] = array($this->params[$paramName], $paramValue);
					}
				} else {
					// Treat the parameter like a boolean.
					$this->params[$arg] = true;
				}
				break;
		}
	}

	/**
	 * Print command usage information.
	 */
	function usage() {
		echo "Script to harvest an archive\n"
			. "Usage: {$this->scriptName} [archive ID] [flags]\n"
			. "If the specified archive ID is \"list\", a list will be displayed.\n"
			. "If the specified archive ID is \"all\", all archives will be harvested.\n"
			. "Flags include:\n"
			. "\tverbose: Display status information during the harvest.\n"
			. "\tflush: Flush the contents of the archive before harvesting.\n"
			. "\tusage: Display additional usage information for the particular archive\n"
			. "\tskipIndexing: Skip flushing and creation of search indexing\n"
			. "Additional flags for each harvester can be listed using:\n"
			. "\t{$this->scriptName} [archive ID] usage\n\n"
			. "For example, to update all records using the OAI harvester:\n"
			. "\t{$this->scriptName} all from=last\n";
	}

	/**
	 * Harvest the selected archive(s) and print a summary for each one.
	 * If no archive was selected, print the usage text (first argument
	 * empty, "help" or "usage") or otherwise a list of all archives.
	 * @return boolean True if every harvest finished without the plugin
	 *  reporting errors, or if usage information was displayed. False if a
	 *  plugin reported errors, an archive named an unknown harvester plugin,
	 *  or the archive list was displayed.
	 */
	function execute() {
		// Remove the execution time limit; "@" hides the warning if that is not permitted.
		@set_time_limit(0);

		$hadErrors = false;

		if ($this->archives) {
			while ($archive =& $this->archives->next()) {
				$recordDao =& DAORegistry::getDAO('RecordDAO');

				// Get the archive plugin
				$plugins =& PluginRegistry::loadCategory('harvesters');
				$pluginName = $archive->getHarvesterPluginName();
				if (!isset($plugins[$pluginName])) {
					// Aborts the whole run, including any archives not yet processed.
					echo "Unknown harvester plugin \"$pluginName\"!\n";
					return false;
				}
				$plugin = $plugins[$pluginName];

				if (isset($this->params['usage'])) {
					// Show the general usage plus this plugin's own options, then stop.
					$this->usage();
					$plugin->describeOptions();
					return true;
				}

				echo 'Selected archive: ' . $archive->getTitle() . "\n";

				$oldRecordCount = $recordDao->getRecordCount($archive->getArchiveId());

				if (isset($this->params['flush'])) {
					echo 'Flushing metadata index for archive... ';
					$recordDao->deleteRecordsByArchiveId(
						$archive->getArchiveId(),
						!isset($this->params['skipIndexing'])
					);
					echo $oldRecordCount . " records deleted.\n";
					// Nothing is kept after a flush, so everything counted
					// below is reported as newly indexed.
					$oldRecordCount = 0;
				}

				$fetchStartTime = time();
				echo "Fetching records...\n";
				$plugin->updateIndex($archive, $this->params);
				$fetchEndTime = time();

				$timeElapsed = $fetchEndTime - $fetchStartTime;
				$recordCount = $recordDao->getRecordCount($archive->getArchiveId());
				// The reported figure is the change in the record count, not
				// the number of records the plugin processed.
				$harvestedRecords = $recordCount - $oldRecordCount;
				// Avoid dividing by zero when the harvest took under a second.
				if ($timeElapsed > 0) $recordsPerSecond = $harvestedRecords / $timeElapsed;
				else $recordsPerSecond = 0;
				$recordsPerSecond = number_format($recordsPerSecond, 2);

				echo "Finished:\n";
				echo "\t$harvestedRecords records indexed\n";
				echo "\t$timeElapsed seconds elapsed\n";
				echo "\t$recordsPerSecond records per second\n";
				echo "\t$oldRecordCount records kept from past harvests\n";
				echo "\t$recordCount records total.\n";

				if ($errors = $plugin->getErrors()) {
					echo "Errors/Warnings:\n";
					foreach (array_unique($errors) as $error) {
						echo "\t$error\n";
					}
					$plugin->clearErrors();
					$hadErrors = true;
					echo "\n";
				}

				// Break the reference before the next iteration rebinds $archive.
				unset($archive);
			}
		} else {
			if ($this->firstParam == '' || $this->firstParam === 'help' || $this->firstParam === 'usage') {
				$this->usage();
				return true;
			}

			// No archive was specified or the specified ID was invalid.
			// Display a list of archives.
			$archiveDao =& DAORegistry::getDAO('ArchiveDAO');
			$recordDao =& DAORegistry::getDAO('RecordDAO');
			$archives =& $archiveDao->getArchives();

			echo "Archive List\n";
			echo "------------\n";
			while ($archive =& $archives->next()) {
				$recordCount = $recordDao->getRecordCount($archive->getArchiveId());
				echo $archive->getArchiveId() . ': ' . $archive->getTitle() . " ($recordCount records)\n";
				unset($archive);
			}
			return false;
		}

		return !$hadErrors;
	}

	/**
	 * Print a status message followed by a newline.
	 * Registered as $this->params['callback'] when the "verbose" flag is
	 * given; presumably invoked by the harvester plugin during updateIndex()
	 * (the caller is not visible in this file).
	 * @param $message string Text to display
	 */
	function statusCallback($message) {
		echo "$message\n";
	}
}
// Build the tool from the command-line arguments, falling back to an empty
// argument list when $argv is not set, then run the harvest.
if (isset($argv)) {
	$tool = new harvest($argv);
} else {
	$tool = new harvest(array());
}
$tool->execute();
?>