Manual:Chris G's botclasses/ParseMirroredWikiIndexBot.php
From Linux Web Expert
This bot parses pages from the mirrored wikiindex (i.e. your local copy of everything exported from WikiIndex) to populate the parse_mirrored_wikiindex_bot.parsed_mirrored_wikiindex table.
ParseMirroredWikiIndexBot.php
<?php
/* ParseMirroredWikiIndexBot.php
* By Leucosticte, https://www.mediawiki.org/wiki/User:Leucosticte
* GNU General Public License 2.0
*
* This bot parses pages from the mirrored wikiindex (i.e. your local copy of everything exported
* from WikiIndex) to populate the parse_mirrored_wikiindex_bot.parsed_mirrored_wikiindex table.
*/
// Database settings
$host = 'localhost';
$dbUser = 'root';
$dbPass = 'REMOVED';
$dbName = 'parse_mirrored_wikiindex_bot';
// Maps each required table name to the SQL file holding its CREATE TABLE statement
$tables = array(
    'parsed_mirrored_wikiindex' => 'parsed-mirrored-wikiindex.sql',
);
// Connect to database. "new mysqli" always yields an object, so a failed connection has to be
// detected through connect_error; testing the object itself can never fail.
$con = new mysqli( $host, $dbUser, $dbPass );
if ( $con->connect_error ) {
    die( 'Could not connect: ' . $con->connect_error );
}
// Create database and select it. The CREATE result is deliberately not fatal: the account may
// lack the CREATE privilege while the database already exists. The select_db() check below
// catches the case where the database is genuinely unavailable.
$con->query( "CREATE DATABASE IF NOT EXISTS $dbName" );
if ( !$con->select_db( $dbName ) ) {
    die( "Could not select database $dbName: " . $con->error );
}
// Check tables' existence: collect the names of every table currently in the database.
// The query goes through the mysqli connection ($con); the legacy mysql_* functions do not
// share that connection and no longer exist in current PHP versions.
$existenceArr = array();
$existenceResult = $con->query( "SHOW TABLES FROM $dbName" );
if ( $existenceResult ) {
    // Each result row has a single column holding one table name
    while ( $existenceRow = $existenceResult->fetch_row() ) {
        $existenceArr[] = $existenceRow[0];
    }
    $existenceResult->free();
}
// Create every required table that is not in the database yet, using the SQL file registered
// for it in $tables. Progress is reported on standard output.
foreach ( $tables as $table => $sqlFile ) {
    echo "Checking table $table...";
    if ( in_array( $table, $existenceArr, true ) ) {
        echo "table exists\n";
        continue;
    }
    echo "not found; creating...";
    if ( !file_exists( $sqlFile ) ) {
        die( "Error: file $sqlFile missing!\n" );
    }
    $sql = file_get_contents( $sqlFile );
    if ( $sql === false ) {
        die( "Error: file $sqlFile could not be read!\n" );
    }
    // Only claim success when the server accepted the statement. A failure is reported but
    // not fatal, so a table that already exists does not stop the run.
    if ( $con->query( $sql ) ) {
        echo "created\n";
    } else {
        echo "failed: " . $con->error . "\n";
    }
}
// Load the bot framework; botclasses.php is expected to provide the wikipedia class used below
include 'botclasses.php';
$wiki = new wikipedia();
// API endpoint of the local mirror of WikiIndex
$wiki->url = 'http://localhost/test13/w/api.php';
// Credentials for the mirror. Lax security because it's just a local development box.
$user = 'Nate';
$pass = 'password';
$wiki->login( $user, $pass );
// Template arguments to look for on each page, spelled exactly as they appear in the wikitext
$fields = array(
    'name',
    'URL',
    'logo',
    'wide logo',
    // 'iw_prefix', // Hopefully WikiIndex will add these fields someday
    // 'iw_url',
    'recentchanges URL',
    'wikinode URL',
    'status',
    'language',
    'editmode',
    'engine',
    'license',
    'maintopic',
    'backupurl',
    'backupdate',
    'pages',
    'statistics URL',
    'wikiFactor',
    'wikiFactor URL',
);
// Resume support: when $startWith holds a page title, every title listed before it is skipped.
// Left empty, all titles are processed.
$startWith = '';
$reachedStartWith = false;
// Error messages are collected here and printed once the whole run is over
$errors = '';
// File listing the page titles to parse, one title per line
$pageTitlesFile = 'CategoryMembers.txt';
if ( file_exists( $pageTitlesFile ) === false ) {
    exit( "File $pageTitlesFile not found" );
}
// Read the titles, dropping the trailing newline of each line
$lines = file( $pageTitlesFile, FILE_IGNORE_NEW_LINES );
// One pattern per field, built once because it does not depend on the page being parsed.
// A template line with data starts with a |, e.g. |name = MisesWiki. The pattern requires the
// equal sign to follow the field name directly (optional spaces/tabs aside), so that "wikiFactor"
// cannot match the "|wikiFactor URL" line. The capture group holds the rest of that line.
$fieldPatterns = array();
foreach ( $fields as $field ) {
    $fieldPatterns[$field] = '/\|' . preg_quote( $field, '/' ) . '[ \t]*=([^\n]*)/';
}
// Parse those templates
foreach ( $lines as $line ) {
    // A blank line in the titles file is not a page title
    if ( $line === '' ) {
        continue;
    }
    // Skip everything listed before $startWith (if one is configured)
    if ( !$reachedStartWith ) {
        if ( (string)$startWith !== '' && $line !== $startWith ) {
            continue;
        }
        $reachedStartWith = true;
    }
    $contents = $wiki->getpage( $line );
    // NOTE(review): what getpage() returns for a missing page is not visible from this file;
    // anything that is not a string is treated as a failed fetch - confirm against botclasses.php
    if ( !is_string( $contents ) ) {
        $errors .= "Error: Could not fetch page. Wiki: $line.\n";
        continue;
    }
    // Clear the comments out of there. An unterminated comment runs to the end of the page.
    while ( ( $commentBegin = strpos( $contents, '<!--' ) ) !== false ) {
        $commentEnd = strpos( $contents, '-->', $commentBegin );
        $commentLength = ( $commentEnd === false )
            ? strlen( $contents ) - $commentBegin
            : $commentEnd + 3 - $commentBegin;
        $contents = substr_replace( $contents, '', $commentBegin, $commentLength );
    }
    $rowFields = array();
    foreach ( $fieldPatterns as $field => $pattern ) {
        if ( preg_match( $pattern, $contents, $match ) !== 1 ) {
            continue;
        }
        // End the value at the newline or the }} (closing the template), whichever comes
        // first. The pattern already stopped at the newline (or at the end of the page).
        $value = $match[1];
        $closingBrace = strpos( $value, '}}' );
        if ( $closingBrace !== false ) {
            $value = substr( $value, 0, $closingBrace );
        }
        $rowFields[$field] = trim( $value );
        // Nested templates will cause problems. Compare against false explicitly: a value
        // that begins with {{ is found at offset 0.
        if ( strpos( $rowFields[$field], '{{' ) !== false ) {
            $errors .= "Error: Template included. Wiki: $line. Field: "
                . $rowFields[$field] . "\n";
        }
    }
    // Debug trace of the comment-stripped page text
    echo $contents . "\n";
    // Store data in database: the page title plus the serialized field array
    if ( $rowFields ) {
        $queryValues = $con->real_escape_string( serialize( $rowFields ) );
        $query = "INSERT INTO parsed_mirrored_wikiindex (pmw_wikiindex_page_title, pmw_data) "
            . "VALUES ('" . $con->real_escape_string( $line ) . "', '$queryValues')";
        echo $query . "\n";
        $success = $con->query( $query );
        echo $success;
        if ( !$success ) {
            $errors .= "Failure inserting $query\n";
        }
    }
}
echo $errors; // If there were any errors along the way, break the bad news at the last minute
parsed-mirrored-wikiindex.sql
-- Stores data gathered from WikiIndex and 20,000 wiki APIs
-- ParseMirroredWikiIndexBot.php inserts one row per parsed WikiIndex page.
CREATE TABLE parsed_mirrored_wikiindex(
-- Primary key, assigned automatically by AUTO_INCREMENT
pmw_id INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
-- WikiIndex page title, used because it's sure to be unique
-- NOTE(review): this table definition does not enforce that uniqueness (there is no UNIQUE
-- index on the column) and the column accepts NULL.
pmw_wikiindex_page_title VARCHAR(255) BINARY,
-- Serialized data from WikiIndex pages and 20,000 wiki APIs
-- ParseMirroredWikiIndexBot.php writes the output of PHP serialize() for the array of
-- template fields it extracted from the page.
pmw_data mediumblob
);