Example of web-scraping using PHP which downloads PDF file(s)

<?php
//It downloads MIT OpenCourseWare PDF file(s) from https://ocw.mit.edu 
//How to execute the current script? 
//Make sure your internet connection is on.
//Step 1: Install the XAMPP server and start the Apache and MySQL services from the control panel
//Step 2: Go to htdocs folder and create a folder PDF.
//Step 3: Copy simple_html_dom.php to htdocs folder
//Step 4: Copy this file (download_all_pdf_files.php) to htdocs folder.
//Step 5: Open browser and type "http://localhost/download_all_pdf_files.php"
//Step 6: All the pdf files will be saved in PDF folder under htdocs


include "simple_html_dom.php";
//PHP DOM html parser
//Please download the complete project from here:
//https://sourceforge.net/projects/simplehtmldom/
//The above file must be included from the project and must be present where the script is executed
//It must be present in htdocs folder while running this script.


/**
 * Copies the resource at $url into the local file $path.
 *
 * The source is opened with fopen(), so $url can be anything fopen() is able
 * to read in this PHP installation (a remote URL or a plain local path).
 *
 * @param string $url  Source to read from.
 * @param string $path Destination file; it is created or truncated.
 * @return bool true when the whole stream was copied, false when either
 *              side could not be opened or the copy itself failed.
 */
function download_file_from_url($url, $path)
{
    $source = fopen($url, "rb");
    if ($source === false) {
        // Nothing was opened, so there is nothing to close.
        return false;
    }

    $target = fopen($path, "wb");
    if ($target === false) {
        fclose($source);
        return false;
    }

    // Let PHP move the bytes: this replaces the manual feof()/fread() loop,
    // which passed a failed read straight into fwrite().
    $copied = stream_copy_to_stream($source, $target);

    fclose($source);
    fclose($target);

    return $copied !== false;
}


/**
 * Collects the href of every <a> element on a page whose href contains ".pdf".
 *
 * The page is fetched and parsed with file_get_html() from simple_html_dom.php.
 *
 * @param string $website_url Address of the page to scan.
 * @return array List of matching href values exactly as they appear in the
 *               page, in the order find('a') yields them. Empty when the
 *               page could not be loaded.
 */
function get_all_pdf_links($website_url)
{
    $html = file_get_html($website_url);
    if (!is_object($html)) {
        // The page could not be loaded; calling ->find() on a non-object would be fatal.
        return array();
    }

    $pdf_links_list = array();
    foreach ($html->find('a') as $element) {
        $href = $element->href;
        // Skip anything that is not a string (presumably anchors without an
        // href attribute). The match is case-insensitive so ".PDF" counts too.
        if (is_string($href) && stripos($href, '.pdf') !== false) {
            $pdf_links_list[] = $href;
        }
    }
    return $pdf_links_list;
}

//Change this URL to your target web page as per your requirement
$target_url = 'https://ocw.mit.edu/courses/mathematics/18-440-probability-and-random-variables-spring-2014/lecture-notes/';
$pdf_links_array = get_all_pdf_links($target_url);

//Derive scheme://host[:port] from the target page so root-relative links
//resolve against the same site without editing a second hard-coded URL.
$target_parts = parse_url($target_url);
$scheme = (is_array($target_parts) && isset($target_parts['scheme'])) ? $target_parts['scheme'] : 'https';
$base_url = '';
if (is_array($target_parts) && isset($target_parts['host'])) {
    $base_url = $scheme . '://' . $target_parts['host'];
    if (isset($target_parts['port'])) {
        $base_url .= ':' . $target_parts['port'];
    }
}

//Make sure the output folder exists before the first download
$output_dir = 'PDF';
if (!is_dir($output_dir)) {
    mkdir($output_dir, 0777, true);
}

//download all pdf file(s)
foreach ($pdf_links_array as $pdf_link) {
    if (preg_match('#^https?://#i', $pdf_link)) {
        //Already an absolute URL: use it untouched
        $complete_pdf_url = $pdf_link;
    } elseif (strpos($pdf_link, '//') === 0) {
        //Protocol-relative link: only the scheme is missing
        $complete_pdf_url = $scheme . ':' . $pdf_link;
    } elseif (strpos($pdf_link, '/') === 0) {
        //Root-relative link: prepend the site origin
        $complete_pdf_url = $base_url . $pdf_link;
    } else {
        //Page-relative link; assumes $target_url is a directory-style URL
        $complete_pdf_url = rtrim($target_url, '/') . '/' . $pdf_link;
    }

    //Build the local file name from the URL path only, so a "?query" or
    //"#fragment" suffix never ends up in the name on disk.
    $url_path = parse_url($complete_pdf_url, PHP_URL_PATH);
    $file_name = is_string($url_path) ? basename($url_path) : '';
    if ($file_name === '') {
        continue;
    }
    $pdf_name = $output_dir . "/" . $file_name;

    //The values come from a remote page, so escape them before echoing into HTML
    echo "<br>Downloading from..." . htmlspecialchars($complete_pdf_url, ENT_QUOTES, 'UTF-8')
        . "...to..." . htmlspecialchars($pdf_name, ENT_QUOTES, 'UTF-8') . "...";
    download_file_from_url($complete_pdf_url, $pdf_name);
}
exit;
?>

0 comments:

Post a Comment