Manual:Chris G's botclasses/ExportAllPagesBot.php

This bot uses Chris G's botclasses to export all pages from a wiki and store them in XML files. Run AllPagesBot.php to generate the text files needed for this bot to work.
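
The title files are assumed to hold one page title per line, which is how the script reads them and how Special:Export expects its newline-separated page list. For example, PageTitles.txt might look like this (hypothetical titles):

Main Page
Help:Contents
Project:Sandbox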

Hack

To make it work, you'll need to hack botclasses.php so that $http is a public rather than a private member of the wikipedia class, because the bot calls $wiki->http->post() directly to fetch each export from Special:Export. So, change:

class wikipedia {
    private $http;
    private $token;
    private $ecTimestamp;
    public $url;

to:

class wikipedia {
    public $http;
    private $token;
    private $ecTimestamp;
    public $url;

Code

<?php
/* ExportAllPagesBot
 * By Leucosticte, https://www.mediawiki.org/wiki/User:Leucosticte
 * GNU Public License 2.0
 *
 * This bot exports all pages from a wiki and stores them in XML files.
 */

/* Setup my classes. */
include( 'botclasses.php' );
$wiki      = new wikipedia;
$wiki->url = "http://en.wikipedia.org/w/api.php";

/* All the login stuff. */
$user = 'REMOVED';
$pass = 'REMOVED';
$wiki->login( $user, $pass );

// Configuration
$dir = "./xmlfiles";
$filePrefix = "$dir/Wikipedia-";
$filePrefixNs6 = "$dir/Wikipedia-Ns6-";
$url1 = "http://en.wikipedia.org/w/index.php?title=Special:Export&pages=";
$url2 = '';
$url3 = "&history&action=submit"; // Get the full history
$pageTitlesFile = 'PageTitles.txt';
$pageTitlesNs6File = 'PageTitlesNs6.txt';
$pagesPerFile = 100; // Put this many pages in each XML file

// Create directory if it doesn't exist
if ( !file_exists( $dir ) ) {
    echo "Creating directory $dir...\n";
    mkdir ( $dir );
}
if ( !is_dir( $dir ) ) {
    die ( "$dir is not a directory\n" );
}

// Test file existence
if ( !file_exists ( $pageTitlesFile ) ) {
    die ( "File $pageTitlesFile not found\n" );
}
if ( !file_exists ( $pageTitlesNs6File ) ) {
    die ( "File $pageTitlesNs6File not found\n" );
}

// Read files
$lines = file( $pageTitlesFile, FILE_IGNORE_NEW_LINES );
$linesNs6 = file( $pageTitlesNs6File, FILE_IGNORE_NEW_LINES );

// Iterate over other namespaces, then over the file namespace
iterate( $wiki, $lines, $filePrefix, $pagesPerFile, $url1, $url2, $url3 );
iterate( $wiki, $linesNs6, $filePrefixNs6, $pagesPerFile, $url1, $url2, $url3 );

function iterate( $wiki, $lines, $filePrefix, $pagesPerFile, $url1, $url2, $url3 ) {
    // Number of XML files that will be produced, rounded up; its digit count
    // determines how many leading zeros the filenames get
    $iterations = (int)ceil( count( $lines ) / $pagesPerFile );
    $digits = strlen( (string)$iterations );
    $fileNumber = 0; // Incrementing number appended to the filename

    foreach ( $lines as $key => $line ) {
        if ( $url2 ) {
            $url2 .= "%0A";
        }
        $url2 .= urlencode ( $line );
        if ( !( ( $key + 1 ) % $pagesPerFile )
            || $key + 1 == count ( $lines ) ) { // If it divides evenly or this is the last one...
            $fileNumber++; // This is part of the filename
            $strFileNumber = str_pad( (string)$fileNumber, $digits, '0', STR_PAD_LEFT ); // Add leading zeros
            $filename = $filePrefix . $strFileNumber . ".xml";
            $url = $url1 . $url2 . $url3;
            echo "Creating file $filename...\n";
            $f = fopen( $filename, "w" );
            $g = $wiki->http->post( $url, true );
            fwrite( $f, $g );
            fclose( $f );
            $url2 = '';
        }
    }
}
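
To run the bot (assuming the script is saved as ExportAllPagesBot.php in the same directory as botclasses.php, PageTitles.txt and PageTitlesNs6.txt), invoke it from the command line:

php ExportAllPagesBot.php

The XML dumps are written to ./xmlfiles, one file per batch of 100 pages (the $pagesPerFile setting).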