Manual:Chris G's botclasses/ParseMirroredWikiIndexBot.php

From Linux Web Expert

Revision as of 17:05, 20 March 2024 by imported>Pppery (alt) (Replace source with syntaxhighlight, replaced: <source → <syntaxhighlight (2), </source> → </syntaxhighlight> (2))
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

This bot parses pages from the mirrored wikiindex (i.e. your local copy of everything exported from WikiIndex) to populate the parse_mirrored_wikiindex_bot.parsed_mirrored_wikiindex table.

ParseMirroredWikiIndexBot.php

<?php
/* ParseMirroredWikiIndexBot.php
 * By Leucosticte, https://www.mediawiki.org/wiki/User:Leucosticte
 * GNU General Public License 2.0
 *
 * This bot parses pages from the mirrored wikiindex (i.e. your local copy of everything exported
 * from WikiIndex) to populate the parse_mirrored_wikiindex_bot.parsed_mirrored_wikiindex table.
 */

// Database connection settings
$host = 'localhost';
$dbUser = 'root';
$dbPass = 'REMOVED';
$dbName = 'parse_mirrored_wikiindex_bot';
// Maps each required table name to the SQL file that creates it
$tables = array(
    'parsed_mirrored_wikiindex' => 'parsed-mirrored-wikiindex.sql',
);

// Report mysqli failures through return values and error properties rather than through
// exceptions (the default since PHP 8.1); this script checks return values throughout.
mysqli_report( MYSQLI_REPORT_OFF );

// Connect to database. "new mysqli" always yields an object, so a failed connection has
// to be detected through connect_error rather than by testing the object itself.
$con = new mysqli( $host, $dbUser, $dbPass );
if ( $con->connect_error ) {
    die( 'Could not connect: ' . $con->connect_error . "\n" );
}

// Create database and select it; nothing below can work without either step
if ( !$con->query( "CREATE DATABASE IF NOT EXISTS $dbName" ) ) {
    die( "Could not create database $dbName: " . $con->error . "\n" );
}
if ( !$con->select_db( $dbName ) ) {
    die( "Could not select database $dbName: " . $con->error . "\n" );
}

// Check tables' existence: collect the names of all tables currently in the database
$existenceArr = array();
$existenceResult = $con->query( "SHOW TABLES FROM $dbName" );
if ( $existenceResult ) {
    while ( $existenceRow = $existenceResult->fetch_row() ) {
        // Each row holds a single column: the table name
        $existenceArr[] = $existenceRow[0];
    }
    $existenceResult->free();
}

// Create every required table that is missing, using its SQL file
foreach ( $tables as $table => $sqlFile ) {
    echo "Checking table $table...";
    if ( in_array( $table, $existenceArr ) ) {
        echo "table exists\n";
        continue;
    }
    echo "not found; creating...";
    if ( !file_exists( $sqlFile ) ) {
        die( "Error: file $sqlFile missing!\n" );
    }
    $sql = file_get_contents( $sqlFile );
    if ( $sql === false ) {
        die( "Error: file $sqlFile could not be read!\n" );
    }
    // Stop here on failure instead of reporting a table that was never created
    if ( !$con->query( $sql ) ) {
        die( "Error: could not create table $table: " . $con->error . "\n" );
    }
    echo "created\n";
}

/* Setup my classes. */
// The wikipedia class instantiated below is presumably defined in botclasses.php, so a
// missing file should stop the script right here rather than at the "new" below.
require_once 'botclasses.php';
$wiki      = new wikipedia;
$wiki->url = "http://localhost/test13/w/api.php"; // Local mirror of WikiIndex

/* All the login stuff. */
$user = 'Nate';
$pass = 'password'; // Lax security because it's just a local development box
$wiki->login( $user, $pass );

// Configuration: text file listing the page titles to parse, one per line
$pageTitlesFile = 'CategoryMembers.txt';

// Test file existence
if ( !file_exists( $pageTitlesFile ) ) {
    die( "File $pageTitlesFile not found\n" );
}

// Read the titles, dropping line endings and blank lines (a blank line is not a page title)
$lines = file( $pageTitlesFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES );
if ( $lines === false ) {
    die( "File $pageTitlesFile could not be read\n" );
}

// Names of the template arguments to pull out of each page, grouped by theme.
// The order of the entries is significant only in that it is the order of lookup.
$fields = array(
    // Identity and branding
    'name', 'URL', 'logo', 'wide logo',
    // Hopefully WikiIndex will add these fields someday: 'iw_prefix', 'iw_url'
    // Related URLs
    'recentchanges URL', 'wikinode URL',
    // Classification
    'status', 'language', 'editmode', 'engine', 'license', 'maintopic',
    // Backups
    'backupurl', 'backupdate',
    // Size and statistics
    'pages', 'statistics URL', 'wikiFactor', 'wikiFactor URL',
);

/**
 * Remove every HTML comment (<!-- ... -->) from a piece of wikitext.
 *
 * A comment that is never closed is removed through to the end of the text.
 *
 * @param string $text Wikitext to clean
 * @return string The text with all comments removed
 */
function stripHtmlComments( $text ) {
    while ( ( $commentBegin = strpos( $text, '<!--' ) ) !== false ) {
        $commentEnd = strpos( $text, '-->', $commentBegin );
        if ( $commentEnd === false ) {
            // Unterminated comment: everything from its opening marker onwards goes
            return substr( $text, 0, $commentBegin );
        }
        // Splice out the comment including its 3-character closing marker
        $text = substr( $text, 0, $commentBegin ) . substr( $text, $commentEnd + 3 );
    }
    return $text;
}

/**
 * Extract the values of the given template arguments from wikitext.
 *
 * An argument looks like "|name = MisesWiki". Its value runs from the equals sign to the
 * next newline, the next "}}" or the end of the text, whichever comes first, and is
 * trimmed. Only the first occurrence of each argument is used.
 *
 * @param string $contents Wikitext to search (comments should already be removed)
 * @param array $fields Argument names to look for
 * @return array Map of argument name => value, containing only the arguments found
 */
function extractTemplateFields( $contents, array $fields ) {
    $rowFields = array();
    $length = strlen( $contents );
    foreach ( $fields as $field ) {
        // Require the equals sign right after the name (spaces/tabs allowed) so that e.g.
        // "wikiFactor" cannot match the "|wikiFactor URL" argument.
        $pattern = '/\|[ \t]*' . preg_quote( $field, '/' ) . '[ \t]*=/';
        if ( !preg_match( $pattern, $contents, $match, PREG_OFFSET_CAPTURE ) ) {
            continue;
        }
        $valueStart = $match[0][1] + strlen( $match[0][0] );
        // End the value at the newline or the }} (closing the template), whichever comes
        // first; if neither follows, the value runs to the end of the text.
        $valueEnd = $length;
        foreach ( array( "\n", '}}' ) as $terminator ) {
            $terminatorPos = strpos( $contents, $terminator, $valueStart );
            if ( $terminatorPos !== false && $terminatorPos < $valueEnd ) {
                $valueEnd = $terminatorPos;
            }
        }
        $rowFields[$field] = trim( substr( $contents, $valueStart, $valueEnd - $valueStart ) );
    }
    return $rowFields;
}

// Set $startWith to a page title to skip every title listed before it (e.g. to resume an
// interrupted run); leave it empty to process the whole list.
$startWith = "";
$reachedStartWith = false;
$errors = '';
// Parse those templates
foreach ( $lines as $line ) {
    if ( !$reachedStartWith && $startWith && $line != $startWith ) {
        continue;
    }
    $reachedStartWith = true;

    $contents = $wiki->getpage( $line );
    // NOTE(review): getpage is assumed to return the page text as a string on success;
    // anything else is treated as a failed fetch - confirm against botclasses.php.
    if ( !is_string( $contents ) ) {
        $errors .= "Error: Could not fetch page. Wiki: $line\n";
        continue;
    }

    // Clear the comments out of there
    $contents = stripHtmlComments( $contents );
    $rowFields = extractTemplateFields( $contents, $fields );

    // Nested templates will cause problems (their value gets cut at the inner }}), so
    // flag them, including values that begin with the nested template.
    foreach ( $rowFields as $field => $value ) {
        if ( strpos( $value, '{{' ) !== false ) {
            $errors .= "Error: Template included. Wiki: $line. Field: $field. Value: "
                . $value . "\n";
        }
    }

    echo $contents . "\n";
    // Store data in database
    if ( $rowFields ) {
        $queryValues = $con->real_escape_string( serialize( $rowFields ) );
        $query = "INSERT INTO parsed_mirrored_wikiindex (pmw_wikiindex_page_title, pmw_data) "
            . "VALUES ('" . $con->real_escape_string( $line ) . "', '$queryValues')";
        echo $query . "\n";
        $success = $con->query( $query );
        echo $success;
        if ( !$success ) {
            $errors .= "Failure inserting $query (" . $con->error . ")\n";
        }
    }
}
echo $errors; // If there were any errors along the way, break the bad news at the last minute

parsed-mirrored-wikiindex.sql

-- One row per parsed WikiIndex page: holds the data gathered from WikiIndex
-- and 20,000 wiki APIs.
CREATE TABLE parsed_mirrored_wikiindex (
    -- Auto-assigned numeric primary key
    pmw_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    -- Title of the WikiIndex page, chosen as the identifier because titles are
    -- sure to be unique (note: no UNIQUE constraint is declared on this column)
    pmw_wikiindex_page_title VARCHAR(255) BINARY,
    -- Serialized data from WikiIndex pages and 20,000 wiki APIs
    pmw_data MEDIUMBLOB
);