Manual:Chris G's botclasses/DownloadAllImagesBot.php

From Linux Web Expert

The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

This bot uses Chris G's botclasses to download all images from a wiki.

<?php
/* DownloadAllImagesBot.php
 * By Leucosticte, https://www.mediawiki.org/wiki/User:Leucosticte
 * GNU Public License 2.0
 *
 * This bot downloads all images from a wiki.
 */

/* Setup my classes. */
// Use require, not include: if botclasses.php is missing, include() only
// raises a warning and the script would die later with a confusing
// "class 'wikipedia' not found" fatal. require() fails fast and clearly.
require( 'botclasses.php' );
$wiki      = new wikipedia;
// API endpoint of the wiki to download from.
$wiki->url = "http://en.wikipedia.org/w/api.php";

/* All the login stuff. */
// NOTE(review): credentials are placeholders — substitute a real bot
// account before running.
$user = 'REMOVED';
$pass = 'REMOVED';
$wiki->login( $user, $pass );

$dir = "./downloadfiles";
// Create the download directory if it doesn't exist; fail loudly if
// creation itself fails (permissions, read-only filesystem, ...),
// instead of only tripping the less-specific is_dir() check below.
if ( !file_exists( $dir ) ) {
    echo "Creating directory $dir...\n";
    if ( !mkdir( $dir ) ) {
        die ( "Could not create directory $dir\n" );
    }
}
// Guard against $dir existing but being a plain file.
if ( !is_dir( $dir ) ) {
    die ( "$dir is not a directory\n" );
}

// $done = false means that there still are more images left to come
$done = false;

// Initialize the cURL session (one handle, reused for every download)
$ch = curl_init();

// This corresponds to the API:Allimages parameter "aifrom" which tells it
// with what image title at which to start listing image titles.
$aifrom = '';

// Keep going until it's evident that there are no more images
// Keep going until it's evident that there are no more images
while ( !$done ) {
        // Start preparing an API query to tell the API: Put the list in PHP format; get 500 image titles
        // at a time; get the urls for the images; sort the list in ascending order.
        $query = "?action=query&format=php&list=allimages&ailimit=500&aiprop=url&aidir=ascending";
        if ( $aifrom ) {
                // Continuation titles may contain spaces, ampersands, plus
                // signs, etc. — they must be URL-encoded or the query
                // string is silently corrupted.
                $query .= "&aifrom=" . urlencode( $aifrom );
        }
        // Get the result of the API query.
        $ret = $wiki->query ( $query );
        // If the result doesn't tell us at what page title to start our next query, then that means this
        // is the end of the images.
        if ( !isset ( $ret['query-continue'] ) ) {
                $done = true;
        } else {
                // The result array has two parts, query and query-continue; this second part tells us where to
                // pick up where we left off
                $aifrom = $ret['query-continue']['allimages']['aicontinue'];
        }
        // Skip the download pass entirely if the batch came back malformed.
        if ( !isset ( $ret['query']['allimages'] ) ) {
                continue;
        }
        // Loop through that array of 500 image urls and download them all
        foreach ( $ret['query']['allimages'] as $element ) {
                // Save images in the directory
                $filename = "$dir/" . $element['name'];
                // If the file already exists, don't save it again
                if ( !file_exists ( $filename ) ) {
                        // For information on what this does, see
                        // https://www.php.net/function.curl-setopt
                        curl_setopt($ch, CURLOPT_URL, $element['url']);
                        curl_setopt($ch, CURLOPT_HEADER, 0);
                        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE );
                        $g = curl_exec( $ch );
                        // curl_exec() returns false on failure; writing that
                        // out would create an empty file which the
                        // file_exists() guard above would then skip forever.
                        if ( $g === false ) {
                                echo "Failed to download {$element['url']}: " . curl_error( $ch ) . "\n";
                                continue;
                        }
                        $f = fopen($filename, "w");
                        if ( $f === false ) {
                                echo "Could not open $filename for writing\n";
                                continue;
                        }
                        fwrite($f, $g);
                        fclose($f);
                }
        }
}
// Release the shared cURL handle now that all downloads are finished.
curl_close( $ch );