borobhaisab | The Blog Pros

May 28, 2023

Bing AI Fixed My Crawler

@dani

I got ChatGpt to fix my Crawler that was showing error.
This code that was showing error:

My Buggy Code

<?php

//START OF SCRIPT FLOW.

//Preparing Crawler & Session: Initialising Variables.

//Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
//SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps  = []; //This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods  = []; //This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs  = []; //his will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities  = []; //This will list SiteMap pages priorities - found on Sitemaps.

//Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls  = []; //This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods  = []; //This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs  = []; //his will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities  = []; //This will list html pages priorities - found on Sitemaps.

//Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
//Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_meta_names  = []; //This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions  = []; //This will list crawled pages Meta Tag Descriptions - found on html pages.
$html_page_titles  = []; //This will list crawled pages Titles - found on html pages.
// -----

//Step 1: Initiate Session - Feed Xml SiteMap Url. Crawing Starting Point.
//Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be .xml SItemap).
//$initial_url = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";

//$xmls = file_get_contents($initial_url); //Should I stick to this line or below line ?
//Parse the sitemap content to object
//$xml = simplexml_load_string($xmls); //Should I stick to this line or above line ?
$xml = simplexml_load_string(file_get_contents($initial_url)); //Code from Dani: https://www.daniweb.com/programming/web-development/threads/540168/what-to-lookout-for-to-prevent-crawler-traps

$dom = new DOMDocument();
$dom->loadXML($xml); //LINE: 44z
//$result = @$dom->loadXML($xml); //LINE: 44

echo __LINE__; echo '<br>'; //LINE: 46

extract_links($xml);

echo __LINE__; echo '<br>';  //LINE: 50

foreach($sitemaps AS $sitemap)
{
    echo __LINE__; echo '<br>';
    extract_links($sitemap); //Extract Links on page.
}

foreach($html_page_urls AS $html_page_url)
{
    echo __LINE__; echo '<br>';
    $scrape_page_data($html_page_url); //Extract Links on page.
}

//END OF SCRIPT FLOW.

//DUNCTIONS BEYOND THIS POINT.

//Links Extractor.
function extract_links()
{
    echo __LINE__; echo '<br>';  //LINE: 73

    GLOBAL $dom;
    //Trigger following IF/ELSEs on each Crawled Page to check for link types. Whether Links lead to more SiteMaps (.xml) or webpages (.html, .htm, .php, etc.).
    if ($dom->nodeName === 'sitemapindex')  //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links.
    {
        echo __LINE__; echo '<br>';

        //parse the index
        // retrieve properties from the sitemap object
        foreach ($xml->sitemapindex as $urlElement) //Extracts xml file urls.
        {
            // get properties
            $sitemaps[] = $sitemap_url = $urlElement->loc;
            $sitemaps_last_mods[] = $last_mod = $urlElement->lastmod;
            $sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq;
            $sitemaps_priorities[] = $priority = $urlElement->priority;

            // print out the properties
            echo 'url: '. $sitemap_url . '<br>';
            echo 'lastmod: '. $last_mod . '<br>';
            echo 'changefreq: '. $change_freq . '<br>';
            echo 'priority: '. $priority . '<br>';

            echo '<br>---<br>';
        }
    } 
    else if ($dom->nodeName === 'urlset')  //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links.
    {
        echo __LINE__; echo '<br>';

        //parse url set
        // retrieve properties from the sitemap object
        foreach ($xml->urlset as $urlElement) //Extracts Sitemap Urls.
        {
            // get properties
            $html_page_urls[] = $html_page_url = $urlElement->loc;
            $html_page_last_mods[] = $last_mod = $urlElement->lastmod;
            $html_page_change_freqs[] = $change_freq = $urlElement->changefreq;
            $html_page_priorities[] = $priority = $urlElement->priority;

            // print out the properties
            echo 'url: '. $html_page_url . '<br>';
            echo 'lastmod: '. $last_mod . '<br>';
            echo 'changefreq: '. $change_freq . '<br>';
            echo 'priority: '. $priority . '<br>';

            echo '<br>---<br>';
        }
    } 

    GLOBAL $sitemaps;
    GLOBAL $sitemaps_last_mods;
    GLOBAL $sitemaps_change_freqs;
    GLOBAL $sitemaps_priorities;

    GLOBAL $html_page_urls;
    GLOBAL $html_page_last_mods;
    GLOBAL $html_page_change_freqs;
    GLOBAL $html_page_priorities;

    echo 'SiteMaps Crawled: ---'; echo '<br><br>'; 
    if(array_count_values($sitemaps)>0)
    {   
        print_r($sitemaps);
        echo '<br>';
    }
    elseif(array_count_values($sitemaps_last_mods)>0)
    {   
        print_r($sitemaps_last_mods);
        echo '<br>';
    }
    elseif(array_count_values($sitemaps_change_freqs)>0)
    {   
        print_r($sitemaps_change_freqs);
        echo '<br>';
    }
    elseif(array_count_values($sitemaps_priorities)>0)
    {   
        print_r($sitemaps_priorities);
        echo '<br><br>'; 
    }

    echo 'Html Pages Crawled: ---'; echo '<br><br>'; 

    if(array_count_values($html_page_urls)>0)
    {   
        print_r($html_page_urls);
        echo '<br>';
    }
    if(array_count_values($html_page_last_mods)>0)
    {   
        print_r($html_page_last_mods);
        echo '<br>';
    }
    if(array_count_values($html_page_change_freqs)>0)
    {   
        print_r($html_page_change_freqs);
        echo '<br>';
    }
    if(array_count_values($html_page_priorities)>0)
    {   
        print_r($html_page_priorities);
        echo '<br>';
    }
}

//Meta Data & Title Extractor.
function scrape_page_data()
{
    GLOBAL $html_page_urls;
    if(array_count_values($html_page_urls)>0)
    {       
        foreach($html_page_urls AS $url)
        {
            // https://www.php.net/manual/en/function.file-get-contents
            $html = file_get_contents($url);

            //https://www.php.net/manual/en/domdocument.construct.php
            $doc = new DOMDocument();

            // https://www.php.net/manual/en/function.libxml-use-internal-errors.php
            libxml_use_internal_errors(true);

            // https://www.php.net/manual/en/domdocument.loadhtml.php
            $doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);

            // https://www.php.net/manual/en/function.libxml-clear-errors.php
            libxml_clear_errors();

            // https://www.php.net/manual/en/domdocument.getelementsbytagname.php
            $meta_tags = $doc->getElementsByTagName('meta');

            // https://www.php.net/manual/en/domnodelist.item.php
            if ($meta_tags->length > 0)
            {
                // https://www.php.net/manual/en/class.domnodelist.php
                foreach ($meta_tags as $tag)
                {
                    // https://www.php.net/manual/en/domnodelist.item.php
                    echo 'Meta Name: ' .$meta_name = $tag->getAttribute('name'); echo '<br>';
                    echo 'Meta Content: ' .$meta_content = $tag->getAttribute('content');  echo '<br>';
                    $html_page_meta_names[] = $meta_name;
                    $html_page_meta_descriptions[] = $meta_content;
                }
            }

            //EXAMPLE 1: Extract Title
            $title_tag = $doc->getElementsByTagName('title');
            if ($title_tag->length>0)
            {
                echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>';
                $html_page_titles[] = $title;
            }

            //EXAMPLE 2: Extract Title
            $title_tag = $doc->getElementsByTagName('title');

            for ($i = 0; $i < $title_tag->length; $i++) {
                echo 'Title: ' .$title = $title_tag->item($i)->nodeValue . "\n";
                $html_page_titles[] = $title;
            }
        }
    }
}

if(array_count_values($html_page_meta_names)>0)
{   
    print_r($html_page_meta_names);
    echo '<br>';
}

if(array_count_values($html_page_meta_descriptions)>0)
{   
    print_r($html_page_meta_descriptions);
    echo '<br>';
}

if(array_count_values($html_page_titles)>0)
{   
    print_r($html_page_titles);
    echo '<br>';
}

//END OF FUNCTIONS.

Bing AI Fix v1:

<?php
ini_set('display_errors', 1);
ini_set('display_startup_errors', 1);
error_reporting(E_ALL);

//No Page Loading Time-Out feature.
//No Page Loading Status Codes Feature.

// Preparing Crawler & Session: Initializing Variables.

// Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
// SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps = []; // This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; // This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs = []; // This will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities = []; // This will list SiteMap pages priorities - found on Sitemaps.

// Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls = []; // This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; // This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; // This will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities = []; // This will list html pages priorities - found on Sitemaps.

// Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
// Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_meta_names = []; // This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions = []; // This will list crawled pages Meta Tag Descriptions - found on html pages.
$html_page_titles = []; // This will list crawled pages Titles - found on html pages.

// Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be .xml Sitemap).
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";

// Call the extract_links function with the initial sitemap URL
extract_links($initial_url);

// Links Extractor.
function extract_links($url)
{
    global $sitemaps, $sitemaps_last_mods, $sitemaps_change_freqs, $sitemaps_priorities;
    global $html_page_urls, $html_page_last_mods, $html_page_change_freqs, $html_page_priorities;

    // Load the sitemap content
    $xml = simplexml_load_string(file_get_contents($url));

    // Check if the sitemap contains other sitemaps
    if (isset($xml->sitemap)) {
        // Loop through the sitemaps
        foreach ($xml->sitemap as $sitemap) {
            // Recursively call the extract_links function on the sitemap URL
            extract_links($sitemap->loc);

            // Store information about the sitemap
            $sitemaps[] = (string)$sitemap->loc;
            if (isset($sitemap->lastmod)) {
                $sitemaps_last_mods[] = (string)$sitemap->lastmod;
            }
            if (isset($sitemap->changefreq)) {
                $sitemaps_change_freqs[] = (string)$sitemap->changefreq;
            }
            if (isset($sitemap->priority)) {
                $sitemaps_priorities[] = (string)$sitemap->priority;
            }
        }
    } else if (isset($xml->url)) {
        // Loop through the URLs in the sitemap
        foreach ($xml->url as $url) {
            // Store information about the URL
            $html_page_urls[] = (string)$url->loc;
            if (isset($url->lastmod)) {
                $html_page_last_mods[] = (string)$url->lastmod;
            }
            if (isset($url->changefreq)) {
                $html_page_change_freqs[] = (string)$url->changefreq;
            }
            if (isset($url->priority)) {
                $html_page_priorities[] = (string)$url->priority;
            }
        }
    }
}

// Scrape meta data from HTML pages
foreach ($html_page_urls as $url) {
    scrape_page_data($url);
}

// Meta Data & Title Extractor.
function scrape_page_data($url)
{
    global $html_page_meta_names, $html_page_meta_descriptions, $html_page_titles;

    // Load the HTML content
    @$doc = new DOMDocument();
    @$doc->loadHTML(file_get_contents($url));

    // Extract meta tags
    foreach ($doc->getElementsByTagName('meta') as $meta) {
        if ($meta->hasAttribute('name') && $meta->hasAttribute('content')) {
            $name = strtolower($meta->getAttribute('name'));
            if ($name === 'description') {
                $html_page_meta_descriptions[] = trim($meta->getAttribute('content'));
            } else {
                $html_page_meta_names[] = trim($meta->getAttribute('content'));
            }
        }
    }

    // Extract title tag
    foreach ($doc->getElementsByTagName('title') as $title) {
        $html_page_titles[] = trim($title->nodeValue);
    }
}

// Print out the extracted data in a more readable format
echo "<pre>";
echo "SITEMAPS:\n";
echo print_r($sitemaps, true);
echo "SITEMAPS LAST MODS:\n";
echo print_r($sitemaps_last_mods, true);
echo "SITEMAPS CHANGE FREQS:\n";
echo print_r($sitemaps_change_freqs, true);
echo "SITEMAPS PRIORITIES:\n";
echo print_r($sitemaps_priorities, true);

echo "HTML PAGE URLS:\n";
echo print_r($html_page_urls, true);
echo "HTML PAGE LAST MODS:\n";
echo print_r($html_page_last_mods, true);
echo "HTML PAGE CHANGE FREQS:\n";
echo print_r($html_page_change_freqs, true);
echo "HTML PAGE PRIORITIES:\n";
echo print_r($html_page_priorities, true);

echo "HTML PAGE META NAMES:\n";
echo print_r($html_page_meta_names, true);
echo "HTML PAGE META DESCRIPTIONS:\n";
echo print_r($html_page_meta_descriptions, true);
echo "HTML PAGE TITLES:\n";
echo print_r($html_page_titles, true);
echo "</pre>";

I told Bing AI to add Time-Out. Crawler v2:

<?php
ini_set('display_errors', 1);
ini_set('display_startup_errors', 1);
error_reporting(E_ALL);

// Set the timeout value in seconds
$timeout = 10;

// Preparing Crawler & Session: Initializing Variables.

// Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
// SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps = []; // This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; // This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs = []; // This will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities = []; // This will list SiteMap pages priorities - found on Sitemaps.

// Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls = []; // This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; // This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; // This will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities = []; // This will list html pages priorities - found on Sitemaps.

// Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
// Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_meta_names = []; // This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions = []; // This will list crawled pages Meta Tag Descriptions - found on html pages.
$html_page_titles = []; // This will list crawled pages Titles - found on html pages.

// Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be .xml Sitemap).
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";

// Call the extract_links function with the initial sitemap URL
extract_links($initial_url);

// Links Extractor.
function extract_links($url)
{
    global $timeout;
    global $sitemaps, $sitemaps_last_mods, $sitemaps_change_freqs, $sitemaps_priorities;
    global $html_page_urls, $html_page_last_mods, $html_page_change_freqs, $html_page_priorities;

    // Create a stream context with the timeout option
    $context = stream_context_create([
        'http' => [
            'timeout' => $timeout
        ]
    ]);

    // Load the sitemap content
    $xml = simplexml_load_string(file_get_contents($url, false, $context));

    // Check if the sitemap contains other sitemaps
    if (isset($xml->sitemap)) {
        // Loop through the sitemaps
        foreach ($xml->sitemap as $sitemap) {
            // Recursively call the extract_links function on the sitemap URL
            extract_links($sitemap->loc);

            // Store information about the sitemap
            $sitemaps[] = (string)$sitemap->loc;
            if (isset($sitemap->lastmod)) {
                $sitemaps_last_mods[] = (string)$sitemap->lastmod;
            }
            if (isset($sitemap->changefreq)) {
                $sitemaps_change_freqs[] = (string)$sitemap->changefreq;
            }
            if (isset($sitemap->priority)) {
                $sitemaps_priorities[] = (string)$sitemap->priority;
            }
        }
    } else if (isset($xml->url)) {
        // Loop through the URLs in the sitemap
        foreach ($xml->url as $url) {
            // Store information about the URL
            $html_page_urls[] = (string)$url->loc;
            if (isset($url->lastmod)) {
                $html_page_last_mods[] = (string)$url->lastmod;
            }
            if (isset($url->changefreq)) {
                $html_page_change_freqs[] = (string)$url->changefreq;
            }
            if (isset($url->priority)) {
                $html_page_priorities[] = (string)$url->priority;
            }
        }
    }
}

// Scrape meta data from HTML pages
foreach ($html_page_urls as $url) {
    scrape_page_data($url);
}

// Meta Data & Title Extractor.
function scrape_page_data($url)
{
    global $html_page_meta_names, $html_page_meta_descriptions, $html_page_titles;

    // Load the HTML content
    @$doc = new DOMDocument();
    @$doc->loadHTML(file_get_contents($url));

    // Extract meta tags
    foreach ($doc->getElementsByTagName('meta') as $meta) {
        if ($meta->hasAttribute('name') && $meta->hasAttribute('content')) {
            $name = strtolower($meta->getAttribute('name'));
            if ($name === 'description') {
                $html_page_meta_descriptions[] = trim($meta->getAttribute('content'));
            } else {
                $html_page_meta_names[] = trim($meta->getAttribute('content'));
            }
        }
    }

    // Extract title tag
    foreach ($doc->getElementsByTagName('title') as $title) {
        $html_page_titles[] = trim($title->nodeValue);
    }
}

// Print out the extracted data in a more readable format
echo "<pre>";
echo "SITEMAPS:\n";
echo print_r($sitemaps, true);
echo "SITEMAPS LAST MODS:\n";
echo print_r($sitemaps_last_mods, true);
echo "SITEMAPS CHANGE FREQS:\n";
echo print_r($sitemaps_change_freqs, true);
echo "SITEMAPS PRIORITIES:\n";
echo print_r($sitemaps_priorities, true);

echo "HTML PAGE URLS:\n";
echo print_r($html_page_urls, true);
echo "HTML PAGE LAST MODS:\n";
echo print_r($html_page_last_mods, true);
echo "HTML PAGE CHANGE FREQS:\n";
echo print_r($html_page_change_freqs, true);
echo "HTML PAGE PRIORITIES:\n";
echo print_r($html_page_priorities, true);

echo "HTML PAGE META NAMES:\n";
echo print_r($html_page_meta_names, true);
echo "HTML PAGE META DESCRIPTIONS:\n";
echo print_r($html_page_meta_descriptions, true);
echo "HTML PAGE TITLES:\n";
echo print_r($html_page_titles, true);
echo "</pre>";

I told Bing AI to add Status Codes for 4xx & 5xx ranges. Crawler v3a:

<?php
ini_set('display_errors', 1);
ini_set('display_startup_errors', 1);
error_reporting(E_ALL);

// Set the timeout value in seconds
$timeout = 10;

// Preparing Crawler & Session: Initializing Variables.

// Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
// SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps = []; // This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; // This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs = []; // This will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities = []; // This will list SiteMap pages priorities - found on Sitemaps.

// Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls = []; // This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; // This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; // This will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities = []; // This will list html pages priorities - found on Sitemaps.

// Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
// Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_meta_names = []; // This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions = []; // This will list crawled pages Meta Tag Descriptions - found on html pages.
$html_page_titles = []; // This will list crawled pages Titles - found on html pages.

// Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be .xml Sitemap).
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";

// Call the extract_links function with the initial sitemap URL
extract_links($initial_url);

// Links Extractor.
function extract_links($url)
{
    global $timeout;
    global $sitemaps, $sitemaps_last_mods, $sitemaps_change_freqs, $sitemaps_priorities;
    global $html_page_urls, $html_page_last_mods, $html_page_change_freqs, $html_page_priorities;

    // Check the status code of the URL
    $status_code = get_status_code($url);
    if ($status_code >= 400) {
        echo "Error: Unable to process URL $url (status code: $status_code)\n";
        return;
    }

    // Create a stream context with the timeout option
    $context = stream_context_create([
        'http' => [
            'timeout' => $timeout
        ]
    ]);

    // Load the sitemap content
    $xml = simplexml_load_string(file_get_contents($url, false, $context));

    // Check if the sitemap contains other sitemaps
    if (isset($xml->sitemap)) {
        // Loop through the sitemaps
        foreach ($xml->sitemap as $sitemap) {
            // Recursively call the extract_links function on the sitemap URL
            extract_links($sitemap->loc);

            // Store information about the sitemap
            $sitemaps[] = (string)$sitemap->loc;
            if (isset($sitemap->lastmod)) {
                $sitemaps_last_mods[] = (string)$sitemap->lastmod;
            }
            if (isset($sitemap->changefreq)) {
                $sitemaps_change_freqs[] = (string)$sitemap->changefreq;
            }
            if (isset($sitemap->priority)) {
                $sitemaps_priorities[] = (string)$sitemap->priority;
            }
        }
    } else if (isset($xml->url)) {
        // Loop through the URLs in the sitemap
        foreach ($xml->url as $url) {
            // Store information about the URL
            $html_page_urls[] = (string)$url->loc;
            if (isset($url->lastmod)) {
                $html_page_last_mods[] = (string)$url->lastmod;
            }
            if (isset($url->changefreq)) {
                $html_page_change_freqs[] = (string)$url->changefreq;
            }
            if (isset($url->priority)) {
                $html_page_priorities[] = (string)$url->priority;
            }
        }
    }
}

// Scrape meta data from HTML pages
foreach ($html_page_urls as $url) {
    scrape_page_data($url);
}

// Meta Data & Title Extractor.
function scrape_page_data($url)
{
    global $html_page_meta_names, $html_page_meta_descriptions, $html_page_titles;

    // Check the status code of the URL
    $status_code = get_status_code($url);
    if ($status_code >= 400) {
        echo "Error: Unable to process URL $url (status code: $status_code)\n";
        return;
    }

    // Load the HTML content
    @$doc = new DOMDocument();
    @$doc->loadHTML(file_get_contents($url));

    // Extract meta tags
    foreach ($doc->getElementsByTagName('meta') as $meta) {
        if ($meta->hasAttribute('name') && $meta->hasAttribute('content')) {
            $name = strtolower($meta->getAttribute('name'));
            if ($name === 'description') {
                $html_page_meta_descriptions[] = trim($meta->getAttribute('content'));
            } else {
                $html_page_meta_names[] = trim($meta->getAttribute('content'));
            }
        }
    }

    // Extract title tag
    foreach ($doc->getElementsByTagName('title') as $title) {
        $html_page_titles[] = trim($title->nodeValue);
    }
}

// Get HTTP status code for a given URL
function get_status_code($url)
{
    global $timeout;

    // Create a stream context with the timeout option
    $context = stream_context_create([
        'http' => [
            'timeout' => $timeout,
            'method' => 'HEAD'
        ]
    ]);

    // Send a HEAD request to get only headers and not body content
    @$headers = get_headers($url, 0, $context);

    // Check if headers were returned
    if ($headers === false) {
        return false;
    }

    // Get the first header line (status line)
    $status_line = $headers[0];

    // Extract the status code from the status line
    list(, $status_code) = explode(' ', $status_line);

    // Return the status code
    return (int)$status_code;
}

// Print out the extracted data in a more readable format
echo "<pre>";
echo "SITEMAPS:\n";
echo print_r($sitemaps, true);
echo "SITEMAPS LAST MODS:\n";
echo print_r($sitemaps_last_mods, true);
echo "SITEMAPS CHANGE FREQS:\n";
echo print_r($sitemaps_change_freqs, true);
echo "SITEMAPS PRIORITIES:\n";
echo print_r($sitemaps_priorities, true);

echo "HTML PAGE URLS:\n";
echo print_r($html_page_urls, true);
echo "HTML PAGE LAST MODS:\n";
echo print_r($html_page_last_mods, true);
echo "HTML PAGE CHANGE FREQS:\n";
echo print_r($html_page_change_freqs, true);
echo "HTML PAGE PRIORITIES:\n";
echo print_r($html_page_priorities, true);

echo "HTML PAGE META NAMES:\n";
echo print_r($html_page_meta_names, true);
echo "HTML PAGE META DESCRIPTIONS:\n";
echo print_r($html_page_meta_descriptions, true);
echo "HTML PAGE TITLES:\n";
echo print_r($html_page_titles, true);
echo "</pre>";

?>

I told Bing AI to add Status Codes for 4xx & 5xx ranges. Crawler v3b:

<?php
ini_set('display_errors', 1);
ini_set('display_startup_errors', 1);
error_reporting(E_ALL);

// Set the timeout value in seconds
$timeout = 10;

// Preparing Crawler & Session: Initializing Variables.

// Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
// SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps = []; // This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; // This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs = []; // This will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities = []; // This will list SiteMap pages priorities - found on Sitemaps.

// Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls = []; // This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; // This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; // This will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities = []; // This will list html pages priorities - found on Sitemaps.

// Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
// Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_meta_names = []; // This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions = []; // This will list crawled pages Meta Tag Descriptions - found on html pages.
$html_page_titles = []; // This will list crawled pages Titles - found on html pages.

// Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be .xml Sitemap).
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";

// Call the extract_links function with the initial sitemap URL
extract_links($initial_url);

// Links Extractor.
function extract_links($url)
{
    global $timeout;
    global $sitemaps, $sitemaps_last_mods, $sitemaps_change_freqs, $sitemaps_priorities;
    global $html_page_urls, $html_page_last_mods, $html_page_change_freqs, $html_page_priorities;

    // Check the status code of the URL
    $status_code = get_status_code($url);
    if ($status_code >= 400 && $status_code < 600) {
        echo "Error: Unable to process URL $url (status code: $status_code)\n";
        return;
    }

    // Create a stream context with the timeout option
    $context = stream_context_create([
        'http' => [
            'timeout' => $timeout
        ]
    ]);

    // Load the sitemap content
    $xml = simplexml_load_string(file_get_contents($url, false, $context));

    // Check if the sitemap contains other sitemaps
    if (isset($xml->sitemap)) {
        // Loop through the sitemaps
        foreach ($xml->sitemap as $sitemap) {
            // Recursively call the extract_links function on the sitemap URL
            extract_links($sitemap->loc);

            // Store information about the sitemap
            $sitemaps[] = (string)$sitemap->loc;
            if (isset($sitemap->lastmod)) {
                $sitemaps_last_mods[] = (string)$sitemap->lastmod;
            }
            if (isset($sitemap->changefreq)) {
                $sitemaps_change_freqs[] = (string)$sitemap->changefreq;
            }
            if (isset($sitemap->priority)) {
                $sitemaps_priorities[] = (string)$sitemap->priority;
            }
        }
    } else if (isset($xml->url)) {
        // Loop through the URLs in the sitemap
        foreach ($xml->url as $url) {
            // Store information about the URL
            $html_page_urls[] = (string)$url->loc;
            if (isset($url->lastmod)) {
                $html_page_last_mods[] = (string)$url->lastmod;
            }
            if (isset($url->changefreq)) {
                $html_page_change_freqs[] = (string)$url->changefreq;
            }
            if (isset($url->priority)) {
                $html_page_priorities[] = (string)$url->priority;
            }
        }
    }
}

// Scrape meta data from HTML pages
foreach ($html_page_urls as $url) {
    scrape_page_data($url);
}

// Meta Data & Title Extractor.
function scrape_page_data($url)
{
    global $html_page_meta_names, $html_page_meta_descriptions, $html_page_titles;

    // Check the status code of the URL
    $status_code = get_status_code($url);
    if ($status_code >= 400 && $status_code < 600) {
        echo "Error: Unable to process URL $url (status code: $status_code)\n";
        return;
    }

    // Load the HTML content
    @$doc = new DOMDocument();
    @$doc->loadHTML(file_get_contents($url));

    // Extract meta tags
    foreach ($doc->getElementsByTagName('meta') as $meta) {
        if ($meta->hasAttribute('name') &&$meta->hasAttribute('content')) {
            $name = strtolower($meta->getAttribute('name'));
            if ($name === 'description') {
                $html_page_meta_descriptions[] = trim($meta->getAttribute('content'));
            } else {
                $html_page_meta_names[] = trim($meta->getAttribute('content'));
            }
        }
    }

    // Extract title tag
    foreach ($doc->getElementsByTagName('title') as$title) {
        $html_page_titles[] = trim($title->nodeValue);
    }
}

// Get HTTP status code for a given URL
function get_status_code($url)
{
    global$timeout;

    // Create a stream context with the timeout option
    $context=stream_context_create([
        'http'=>[
            'timeout'=>$timeout,
            'method'=>'HEAD'
        ]
    ]);

    // Send a HEAD request to get only headers and not body content
   @$headers=get_headers($url,0,$context);

   // Check if headers were returned
   if ($headers===false){
       return false;
   }

   // Get the first header line(status line)
   $status_line=$headers[0];

   // Extract the status code from the status line
   list(,$status_code)=explode(' ',$status_line);

   // Return the status code
   return(int)$status_code;
}

// Print out the extracted data in a more readable format
echo "<pre>";
echo "SITEMAPS:\n";
echo print_r($sitemaps,true);
echo "SITEMAPS LAST MODS:\n";
echo print_r($sitemaps_last_mods,true);
echo "SITEMAPS CHANGE FREQS:\n";
echo print_r($sitemaps_change_freqs,true);
echo "SITEMAPS PRIORITIES:\n";
echo print_r($sitemaps_priorities,true);

echo "HTML PAGE URLS:\n";
echo print_r($html_page_urls,true);
echo "HTML PAGE LAST MODS:\n";
echo print_r($html_page_last_mods,true);
echo "HTML PAGE CHANGE FREQS:\n";
echo print_r($html_page_change_freqs,true);
echo "HTML PAGE PRIORITIES:\n";
echo print_r($html_page_priorities,true);

echo "HTML PAGE META NAMES:\n";
echo print_r($html_page_meta_names,true);
echo "HTML PAGE META DESCRIPTIONS:\n";
echo print_r($html_page_meta_descriptions,true);
echo "HTML PAGE TITLES:\n";
echo print_r($html_page_titles,true);
echo "</pre>";

?>

May 25, 2023

ChatGpt Fixed My Crawler – & Derived 2 More Versions

@dani

I got ChatGpt to fix my Crawler that was showing error.
This code that was showing error:

My Buggy Code

<?php

//START OF SCRIPT FLOW.

//Preparing Crawler & Session: Initialising Variables.

//Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
//SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps  = []; //This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods  = []; //This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs  = []; //his will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities  = []; //This will list SiteMap pages priorities - found on Sitemaps.

//Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls  = []; //This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods  = []; //This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs  = []; //his will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities  = []; //This will list html pages priorities - found on Sitemaps.

//Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
//Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_meta_names  = []; //This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions  = []; //This will list crawled pages Meta Tag Descriptions - found on html pages.
$html_page_titles  = []; //This will list crawled pages Titles - found on html pages.
// -----

//Step 1: Initiate Session - Feed Xml SiteMap Url. Crawing Starting Point.
//Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be .xml SItemap).
//$initial_url = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";

//$xmls = file_get_contents($initial_url); //Should I stick to this line or below line ?
//Parse the sitemap content to object
//$xml = simplexml_load_string($xmls); //Should I stick to this line or above line ?
$xml = simplexml_load_string(file_get_contents($initial_url)); //Code from Dani: https://www.daniweb.com/programming/web-development/threads/540168/what-to-lookout-for-to-prevent-crawler-traps

$dom = new DOMDocument();
$dom->loadXML($xml); //LINE: 44z
//$result = @$dom->loadXML($xml); //LINE: 44

echo __LINE__; echo '<br>'; //LINE: 46

extract_links($xml);

echo __LINE__; echo '<br>';  //LINE: 50

foreach($sitemaps AS $sitemap)
{
    echo __LINE__; echo '<br>';
    extract_links($sitemap); //Extract Links on page.
}

foreach($html_page_urls AS $html_page_url)
{
    echo __LINE__; echo '<br>';
    $scrape_page_data($html_page_url); //Extract Links on page.
}

//END OF SCRIPT FLOW.

//DUNCTIONS BEYOND THIS POINT.

//Links Extractor.
function extract_links()
{
    echo __LINE__; echo '<br>';  //LINE: 73

    GLOBAL $dom;
    //Trigger following IF/ELSEs on each Crawled Page to check for link types. Whether Links lead to more SiteMaps (.xml) or webpages (.html, .htm, .php, etc.).
    if ($dom->nodeName === 'sitemapindex')  //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links.
    {
        echo __LINE__; echo '<br>';

        //parse the index
        // retrieve properties from the sitemap object
        foreach ($xml->sitemapindex as $urlElement) //Extracts xml file urls.
        {
            // get properties
            $sitemaps[] = $sitemap_url = $urlElement->loc;
            $sitemaps_last_mods[] = $last_mod = $urlElement->lastmod;
            $sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq;
            $sitemaps_priorities[] = $priority = $urlElement->priority;

            // print out the properties
            echo 'url: '. $sitemap_url . '<br>';
            echo 'lastmod: '. $last_mod . '<br>';
            echo 'changefreq: '. $change_freq . '<br>';
            echo 'priority: '. $priority . '<br>';

            echo '<br>---<br>';
        }
    } 
    else if ($dom->nodeName === 'urlset')  //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links.
    {
        echo __LINE__; echo '<br>';

        //parse url set
        // retrieve properties from the sitemap object
        foreach ($xml->urlset as $urlElement) //Extracts Sitemap Urls.
        {
            // get properties
            $html_page_urls[] = $html_page_url = $urlElement->loc;
            $html_page_last_mods[] = $last_mod = $urlElement->lastmod;
            $html_page_change_freqs[] = $change_freq = $urlElement->changefreq;
            $html_page_priorities[] = $priority = $urlElement->priority;

            // print out the properties
            echo 'url: '. $html_page_url . '<br>';
            echo 'lastmod: '. $last_mod . '<br>';
            echo 'changefreq: '. $change_freq . '<br>';
            echo 'priority: '. $priority . '<br>';

            echo '<br>---<br>';
        }
    } 

    GLOBAL $sitemaps;
    GLOBAL $sitemaps_last_mods;
    GLOBAL $sitemaps_change_freqs;
    GLOBAL $sitemaps_priorities;

    GLOBAL $html_page_urls;
    GLOBAL $html_page_last_mods;
    GLOBAL $html_page_change_freqs;
    GLOBAL $html_page_priorities;

    echo 'SiteMaps Crawled: ---'; echo '<br><br>'; 
    if(array_count_values($sitemaps)>0)
    {   
        print_r($sitemaps);
        echo '<br>';
    }
    elseif(array_count_values($sitemaps_last_mods)>0)
    {   
        print_r($sitemaps_last_mods);
        echo '<br>';
    }
    elseif(array_count_values($sitemaps_change_freqs)>0)
    {   
        print_r($sitemaps_change_freqs);
        echo '<br>';
    }
    elseif(array_count_values($sitemaps_priorities)>0)
    {   
        print_r($sitemaps_priorities);
        echo '<br><br>'; 
    }

    echo 'Html Pages Crawled: ---'; echo '<br><br>'; 

    if(array_count_values($html_page_urls)>0)
    {   
        print_r($html_page_urls);
        echo '<br>';
    }
    if(array_count_values($html_page_last_mods)>0)
    {   
        print_r($html_page_last_mods);
        echo '<br>';
    }
    if(array_count_values($html_page_change_freqs)>0)
    {   
        print_r($html_page_change_freqs);
        echo '<br>';
    }
    if(array_count_values($html_page_priorities)>0)
    {   
        print_r($html_page_priorities);
        echo '<br>';
    }
}

//Meta Data & Title Extractor.
function scrape_page_data()
{
    GLOBAL $html_page_urls;
    if(array_count_values($html_page_urls)>0)
    {       
        foreach($html_page_urls AS $url)
        {
            // https://www.php.net/manual/en/function.file-get-contents
            $html = file_get_contents($url);

            //https://www.php.net/manual/en/domdocument.construct.php
            $doc = new DOMDocument();

            // https://www.php.net/manual/en/function.libxml-use-internal-errors.php
            libxml_use_internal_errors(true);

            // https://www.php.net/manual/en/domdocument.loadhtml.php
            $doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);

            // https://www.php.net/manual/en/function.libxml-clear-errors.php
            libxml_clear_errors();

            // https://www.php.net/manual/en/domdocument.getelementsbytagname.php
            $meta_tags = $doc->getElementsByTagName('meta');

            // https://www.php.net/manual/en/domnodelist.item.php
            if ($meta_tags->length > 0)
            {
                // https://www.php.net/manual/en/class.domnodelist.php
                foreach ($meta_tags as $tag)
                {
                    // https://www.php.net/manual/en/domnodelist.item.php
                    echo 'Meta Name: ' .$meta_name = $tag->getAttribute('name'); echo '<br>';
                    echo 'Meta Content: ' .$meta_content = $tag->getAttribute('content');  echo '<br>';
                    $html_page_meta_names[] = $meta_name;
                    $html_page_meta_descriptions[] = $meta_content;
                }
            }

            //EXAMPLE 1: Extract Title
            $title_tag = $doc->getElementsByTagName('title');
            if ($title_tag->length>0)
            {
                echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>';
                $html_page_titles[] = $title;
            }

            //EXAMPLE 2: Extract Title
            $title_tag = $doc->getElementsByTagName('title');

            for ($i = 0; $i < $title_tag->length; $i++) {
                echo 'Title: ' .$title = $title_tag->item($i)->nodeValue . "\n";
                $html_page_titles[] = $title;
            }
        }
    }
}

if(array_count_values($html_page_meta_names)>0)
{   
    print_r($html_page_meta_names);
    echo '<br>';
}

if(array_count_values($html_page_meta_descriptions)>0)
{   
    print_r($html_page_meta_descriptions);
    echo '<br>';
}

if(array_count_values($html_page_titles)>0)
{   
    print_r($html_page_titles);
    echo '<br>';
}

//END OF FUNCTIONS.

ChatGpt fixed it to the following. Do let me know if the code is ok or not. It is working.

Crawler v1

<?php
ini_set('display_errors', 1);
ini_set('display_startup_errors', 1);
error_reporting(E_ALL);

// Preparing Crawler & Session: Initializing Variables.

// Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
// SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps = []; // This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; // This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs = []; // This will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities = []; // This will list SiteMap pages priorities - found on Sitemaps.

// Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls = []; // This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; // This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; // This will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities = []; // This will list html pages priorities - found on Sitemaps.

// Step 1: Initiate Session - Feed Xml SiteMap URL. Crawling Starting Point.
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";
$xml = simplexml_load_file($initial_url);
$dom = new DOMDocument();
$dom->loadXML($xml->asXML());

echo __LINE__ . '<br>';

crawl_sitemaps($xml);

foreach ($html_page_urls as $html_page_url) {
    echo __LINE__ . '<br>';
    scrape_page_data($html_page_url); // Extract Meta Data and Title from HTML page.
}

// END OF SCRIPT FLOW.

// FUNCTIONS BEYOND THIS POINT.

// Crawl SiteMaps.
function crawl_sitemaps($xml)
{
    global $sitemaps;
    global $html_page_urls;

    if ($xml->getName() === 'sitemapindex') {
        foreach ($xml->sitemap as $urlElement) {
            $sitemaps[] = $sitemap_url = (string)$urlElement->loc;
            $sitemaps_last_mods[] = $last_mod = (string)$urlElement->lastmod;
            $sitemaps_change_freqs[] = $change_freq = (string)$urlElement->changefreq;
            $sitemaps_priorities[] = $priority = (string)$urlElement->priority;

            echo 'sitemap_url: ' . $sitemap_url . '<br>';
            echo 'last_mod: ' . $last_mod . '<br>';
            echo 'change_freq: ' . $change_freq . '<br>';
            echo 'priority: ' . $priority . '<br>';

            echo '<br>---<br>';

            $sitemap_xml = simplexml_load_file($sitemap_url);
            crawl_sitemaps($sitemap_xml); // Recursively crawl nested sitemaps.
        }
    } elseif ($xml->getName() === 'urlset') {
        foreach ($xml->url as $urlElement) {
            $html_page_urls[] = $html_page_url = (string)$urlElement->loc;
            $html_page_last_mods[] = $last_mod = (string)$urlElement->lastmod;
            $html_page_change_freqs[] = $change_freq = (string)$urlElement->changefreq;
            $html_page_priorities[] = $priority = (string)$urlElement->priority;

            echo 'html_page_url: ' . $html_page_url . '<br>';
            echo 'last_mod: ' . $last_mod . '<br>';
            echo 'change_freq: ' . $change_freq . '<br>';
            echo 'priority: ' . $priority . '<br>';

            echo '<br>---<br>';
        }
    }

    echo 'SiteMaps Crawled: ---<br><br>';
    print_r($sitemaps);
    echo '<br><br>';

    echo 'HTML Pages Crawled: ---<br><br>';
    print_r($html_page_urls);
    echo '<br><br>';
}

// Meta Data & Title Extractor.
function scrape_page_data($html_page_url)
{
    $html = file_get_contents($html_page_url);

    $doc = new DOMDocument();
    libxml_use_internal_errors(true);
    $doc->loadHTML($html, LIBXML_COMPACT | LIBXML_NOERROR | LIBXML_NOWARNING);
    libxml_clear_errors();

    $meta_tags = $doc->getElementsByTagName('meta');
    if ($meta_tags->length > 0) {
        foreach ($meta_tags as $tag) {
            echo 'Meta Name: ' . $meta_name = $tag->getAttribute('name') . '<br>';
            echo 'Meta Content: ' . $meta_content = $tag->getAttribute('content') . '<br>';
        }
    }

    $title_tag = $doc->getElementsByTagName('title');
    if ($title_tag->length > 0) {
        echo 'Title: ' . $title = $title_tag[0]->textContent . '<br>';
    }
}
?>

May 15, 2023

DomDocument Parser Best Practice Questions

@dani

Checking these 2 working codes of your's out. I got some basic questions.

<?php

ini_set('display_errors',1);
ini_set('display_startup_errors',1);
error_reporting(E_ALL);

//Dan's Code.
//Code from: https://www.daniweb.com/programming/web-development/threads/538868/simplehtmldom-failing#post2291972
//Sitemap Protocol: https://www.sitemaps.org/protocol.html

// Initiate ability to manipulate the DOM and load that baby up
$doc = new DOMDocument();

$message = file_get_contents('https://www.daniweb.com/programming/web-development/threads/538868/simplehtmldom-failing#post2288453');

// https://www.php.net/manual/en/function.libxml-use-internal-errors.php
libxml_use_internal_errors(true);

// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($message, LIBXML_NOENT|LIBXML_COMPACT);

// https://www.php.net/manual/en/function.libxml-clear-errors.php
libxml_clear_errors();

// Fetch all <a> tags
$links = $doc->getElementsByTagName('a');

// If <a> tags exist ...
if ($links->length > 0)
{
    // For each <a> tag ...
    foreach ($links AS $link)
    {
        $link->setAttribute('class', 'link-style');
    }
}
// Because we are actually manipulating the DOM, DOMDocument will add complete <html><body> tags we need to strip out
$message = str_replace(array('<body>', '</body>'), '', $doc->saveHTML($doc->getElementsByTagName('body')->item(0)));

?>

<?php

ini_set('display_errors',1);
ini_set('display_startup_errors',1);
error_reporting(E_ALL);

//Dan's Code.
//CODE FROM: https://www.daniweb.com/programming/web-development/threads/540121/how-to-extract-meta-tags-using-domdocument
$url = "https://www.daniweb.com/programming/web-development/threads/540013/how-to-find-does-not-contain-or-does-contain";

// https://www.php.net/manual/en/function.file-get-contents
$html = file_get_contents($url);

//https://www.php.net/manual/en/domdocument.construct.php
$doc = new DOMDocument();

// https://www.php.net/manual/en/function.libxml-use-internal-errors.php
libxml_use_internal_errors(true);

// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);

// https://www.php.net/manual/en/function.libxml-clear-errors.php
libxml_clear_errors();

//EXTRACT METAS
// https://www.php.net/manual/en/domdocument.getelementsbytagname.php
$meta_tags = $doc->getElementsByTagName('meta');

// https://www.php.net/manual/en/domnodelist.item.php
if ($meta_tags->length > 0)
{
    // https://www.php.net/manual/en/class.domnodelist.php
    foreach ($meta_tags as $tag)
    {
        // https://www.php.net/manual/en/domnodelist.item.php
        echo 'Name: ' .$name = $tag->getAttribute('name'); echo '<br>';
        echo 'Content: ' .$content = $tag->getAttribute('content');  echo '<br>';
    }
}

//EXAMPLE 1: EXTRACT TITLE
//CODE FROM: https://www.daniweb.com/programming/web-development/threads/540121/how-to-extract-meta-tags-using-domdocument
$title_tag = $doc->getElementsByTagName('title');
if ($title_tag->length>0)
{
    echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>';
}

?>

Q1.
On the first code, you wrote new DOMDocument(); prior to file_get_contents().
While on the second code, you did vice versa. using my logic, I reckon it does not matter the order. But what is best practice to speeden-up the php interpreter to handle the job faster ?

Q2.
On both the codes, you wrote ...

// https://www.php.net/manual/en/function.libxml-use-internal-errors.php
libxml_use_internal_errors(true);

// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);

// https://www.php.net/manual/en/function.libxml-clear-errors.php
libxml_clear_errors();

... after the new DOMDocument() AND file_get_contents().

Does it have to be in this order or can I add thesese 3 error lines before the
new DOMDocument() AND file_get_contents() ?
Using my logic, I reckon it does not matter the order. But what is best practice to speeden-up the php interpreter to handle the job faster ?

But, I prefer to add them at the top instead. Is this ok ?

Q3.
On the first code, you put these error lines ...

// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($message, LIBXML_NOENT|LIBXML_COMPACT);

... while on the second code, another ...

// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);

... why you did like this ? What is the significance of doing like this ?

Q3A. What issue will I face if I do vice versa ?
Q3B. Anyway, what is the wisdom behind the way you did things ?
Q3C. What is the REAL difference between the two error codes ?
Q3D. LIBXML_NOENT|LIBXML_COMPACT what do these 2 mean ?

Q4. Anything else I need to know ?

May 12, 2023

How To Integrate Html Form With Payment Gateway ?

Folks,

I have never programmed any API stuff. Let us change this tonight.
I want to allow people to pay me with BitCoin on my website.
This you see below is API vofr of the Official BitCoin Payment Gateway.
Now, show me how to integrate this on my website. Meaning, show how to write the html form so the following api php code is integrated with the html form.

Php Api code:
https://codepal.ai/code-generator/query/wTcOQ1Ps/php-bitcoin-payment-gateway-api-confirmation

/**
 * This function confirms a payment made to a website using the official Bitcoin payment gateway API.
 *
 * @param string $transaction_id The unique transaction ID generated by the Bitcoin payment gateway
 * @param float $amount The amount of Bitcoin paid by the customer
 * @param string $customer_address The Bitcoin address of the customer who made the payment
 * @param string $website_address The Bitcoin address of the website receiving the payment
 * @param string $api_key The API key provided by the Bitcoin payment gateway
 *
 * @return bool Returns true if the payment is confirmed, false otherwise
 */
function confirmBitcoinPayment($transaction_id, $amount, $customer_address, $website_address, $api_key) {
    // Initialize cURL
    $ch = curl_init();

    // Set the cURL options
    curl_setopt($ch, CURLOPT_URL, "https://api.bitcoinpaymentgateway.io/v1/confirm_payment");
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query([
        'transaction_id' => $transaction_id,
        'amount' => $amount,
        'customer_address' => $customer_address,
        'website_address' => $website_address,
        'api_key' => $api_key
    ]));
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    // Execute the cURL request
    $response = curl_exec($ch);

    // Check for errors
    if (curl_errno($ch)) {
        error_log("Error confirming Bitcoin payment: " . curl_error($ch));
        curl_close($ch);
        return false;
    }

    // Close the cURL connection
    curl_close($ch);

    // Parse the response
    $response = json_decode($response, true);

    // Check if the payment is confirmed
    if ($response['status'] == 'confirmed') {
        return true;
    } else {
        return false;
    }
}

Your sample html form should give me an idea how things should get integrated.

May 12, 2023

What To Lookout For To Prevent Crawler Traps ?

Experienced Fellow Programmers,

I asking questions to those who have experiences with web crawlers.
I do not want my web crawler getting trapped onto some domain, while crawling it. Trapped and going in a loop for some reason. And so, what to look-out for to prevent loops ?

1.I know crawlers should not spider dynamic urls as they can go in a neverending loop. And so, apart from that, what other dangers are there ?

2.I know I have to program the crawler to avoid trying crawl pages that are dead. And so, got to lookout for 404 pages. And what other numbers got to lookout for ? I need a list of error numbers to feed my crawler.

3.I do not want any hacker/crook/fraud calling my crawler (pinging it) to crawl bad natured pages. Pages that are phishing pages. And so, how do I write code for my crawler to identify phishing pages so it does not crawl or index them on my searchengine ?

4.I do not want any hacker/crook/fraud calling my crawler (pinging it) to crawl his pages that are infected with virus, worm, ant, spyware, etc. Pages that will infect my crawler to carry infections to other domains it crawls afterwards. And so, how do I write code for my crawler to identify infected pages so it does not crawl or index them on my searchengine nor carry the infections to third party domains ?

When I asked 4 times above, "How do I code ?", I meant, "which php fuctions you want me to look into ?".
Anything else I got to program my crawler to watch-out for ?
Good questions. hey ?

May 2, 2023

How To Extract Meta Tags Using DomDocument ?

Folks,

Using DomDocument, I am trying to build a crawler that, when I feed it a starting point url (initial url to start the crawling & link extracting from), it should navigate to the starting url and extract all the links found on the page.

<?php

$xml = file_get_contents($sitemapUrl); //Should I stick to this line or below line ?
// parse the sitemap content to object
$xml = simplexml_load_string($sitemapUrl); //Should I stick to this line or above line ?

$dom = new DOMDocument();
$dom->loadXML($xml);
if ($dom->nodeName === 'sitemapindex')
{
    //parse the index
    // retrieve properties from the sitemap object
    foreach ($xml->urlset as $urlElement) //Extracts html file urls.
    {
        // get properties
        $url = $urlElement->loc;
        $lastmod = $urlElement->lastmod;
        $changefreq = $urlElement->changefreq;
        $priority = $urlElement->priority;

        // print out the properties
        echo 'url: '. $url . '<br>';
        echo 'lastmod: '. $lastmod . '<br>';
        echo 'changefreq: '. $changefreq . '<br>';
        echo 'priority: '. $priority . '<br>';

        echo '<br>---<br>';
    }
} 
else if ($dom->nodeName === 'urlset')
{
    //parse url set
    // retrieve properties from the sitemap object
    foreach ($xml->sitemapindex as $urlElement) //Extracts Sitemap Urls.
    {
        // get properties
        $url = $urlElement->loc;
        $lastmod = $urlElement->lastmod;
        $changefreq = $urlElement->changefreq;
        $priority = $urlElement->priority;

        // print out the properties
        echo 'url: '. $url . '<br>';
        echo 'lastmod: '. $lastmod . '<br>';
        echo 'changefreq: '. $changefreq . '<br>';
        echo 'priority: '. $priority . '<br>';

        echo '<br>---<br>';
    }
}

Now, how to write code to extract meta tags using DomDocument ?
Where can I find the code here ?
https://www.php.net/domdocument

April 30, 2023

PAGINATION A – mysqli_stmtm_store_result() Question

Hello There,

Look at this PAGINATION I built.
This script I built from scratch and is working fine to query my Mysql DB and show results.
But I need your feed-back to know if I managed to use the functions in the correct order or not.

//FUNCTIONS IN USE TO QUERY DATABASE:
//mysqli_stmt_store_result().
//mysqli_stmt_free_result().
//$rows_count = mysqli_stmt_num_rows($stmt).
//mysqli_stmt_get_result().

//FUNCTIONS IN USE TO BUILD PAGINATION SECTION
//urlencode().
//rawurlencode().
//htmlspecialchars().

I have a few questions.

Q1. Is it true that, I need to use mysqli_stmt_store_result($stmt) prior to using mysqli_stmt_num_rows($stmt) ?

Q2. Is it true that, I need to use mysqli_stmt_free_result($stmt) after every mysqli_stmt_store_result($stmt) ?

<?php
//FUNCTIONS IN USE TO QUERY DATABASE:
//mysqli_stmt_store_result().
//mysqli_stmt_free_result().
//$rows_count =  mysqli_stmt_num_rows($stmt).
//mysqli_stmt_get_result().

//FUNCTIONS IN USE TO BUILD PAGINATION SECTION
//urlencode().
//rawurlencode().
//htmlspecialchars().

//Report Error.
ini_set('display_errors',1);
ini_set('display_startup_errors',1);
error_reporting(E_ALL);

//Valid $_GET Items.
$tbls = array('spidered_web_index','$submitted_web_index');

$spidered_web_index = array('id','date_and_time','domain','url','title','header','meta_keyword','meta_description','keyword','keyphrase');
$submitted_web_index = array('id','date_and_time','domain','url','title','description','country','keyword','keyphrase');

//Extract $_GETs.
$tbl = !EMPTY($_POST['tbl'])?strtolower($_POST['tbl']):(!EMPTY($_GET['tbl'])?strtolower($_GET['tbl']):'listed_links');
$input_1 = !EMPTY($_GET['input_1'])?$_GET['input_1']:die('Make your input for us to search!');
$input_2 = !EMPTY($_GET['input_2'])?$_GET['input_2']:null;
$col_1 = !EMPTY($_GET['col_1'])?strtolower($_GET['col_1']):die('Input MySql Column to search!');
$col_2 = !EMPTY($_GET['col_2'])?strtolower($_GET['col_2']):null;
$bool = !EMPTY($_GET['bool'])?strtolower($_GET['bool']):null;
$page = !EMPTY($_GET['pg'])?intval($_GET['pg']):1;
$limit = !EMPTY($_GET['lmt'])?intval($_GET['lmt']):1;
$offset = ($page*$limit)-$limit;

if(ISSET($col_2))
{
    if(!in_array($col_2,$links_table_columns))
    {
        die('Invalid Mysql Table!');
    }
}

if(!in_array($col_1,$links_table_columns))
{
    die('Invalid Mysql Table!');
}

//Query DB.
mysqli_report(MYSQLI_REPORT_ERROR|MYSQLI_REPORT_STRICT);

$conn = mysqli_connect("localhost","root","","buzz"); //mysqli_connect("server","user","password","db");

mysqli_set_charset($conn,'utf8mb4');

if(mysqli_connect_errno())
{
    printf("Mysqli Connection Error: %s",mysqli_connect_error());
}

$stmt = mysqli_stmt_init($conn);

if($bool=='and')
{
    $input_1 = $_GET['input_1'];
    $input_2 = $_GET['input_2'];
    $sql_count = "SELECT id,domain,word,phrase from $tbl WHERE $col_1 = ? AND $col_2 = ?";
    $sql = "SELECT id,domain,word,phrase from $tbl WHERE $col_1 = ? AND $col_2 = ? LIMIT $limit OFFSET $offset";
}
elseif($bool=='or')
{
    $input_1 = $_GET['input_1'];
    $input_2 = $_GET['input_2'];
    $sql_count = "SELECT id,domain,word,phrase from $tbl WHERE $col_1 = ? OR $col_2 = ?";
    $sql = "SELECT id,domain,word,phrase from $tbl WHERE $col_1 = ? OR $col_2 = ? LIMIT $limit OFFSET $offset";
}
else
{
    $input_1 = $_GET['input_1'];
    $sql_count = "SELECT id,domain,word,phrase from $tbl WHERE $col_1 = ?";
    $sql = "SELECT id,domain,word,phrase from $tbl WHERE $col_1 = ? LIMIT $limit OFFSET $offset";
}

if(!mysqli_stmt_prepare($stmt,$sql_count)) //Fetch All Matching Rows Number.
{
    echo 'Mysqli Error: ' .mysqli_stmt_error($stmt);
    echo '<br>';
    echo 'Mysqli Error No: ' .mysqli_stmt_errno($stmt);
}
else
{
    if($bool=='and' || $bool=='or')
    {
        mysqli_stmt_bind_param($stmt,"ss",$input_1,$input_2);
    }
    else
    {
        mysqli_stmt_bind_param($stmt,"s",$input_1);
    }

    mysqli_stmt_execute($stmt);
    mysqli_stmt_store_result($stmt); //Necessary to use with mysqli_stmt_num_rows() when SQL query is SELECT.

    //Fetch Matching Rows Count.
    //mysqli_stmt_num_rows() has to come after mysqli_stmt_store_result().
    echo 'Total Result: ' .$rows_count =  mysqli_stmt_num_rows($stmt); echo '<br><br>';
    mysqli_stmt_free_result($stmt); //Is this really necessary ?
}

if(!mysqli_stmt_prepare($stmt,$sql)) //Fetch Rows based on Row Limit per page.
{
    echo 'Mysqli Error: ' .mysqli_stmt_error($stmt);
    echo '<br>';
    echo 'Mysqli Error No: ' .mysqli_stmt_errno($stmt);
}
else
{
    if($bool=='and' || $bool=='or')
    {
        mysqli_stmt_bind_param($stmt,"ss",$input_1,$input_2);
    }
    else
    {
        mysqli_stmt_bind_param($stmt,"s",$input_1);
    }

    mysqli_stmt_execute($stmt);
    $result = mysqli_stmt_get_result($stmt);

    while($row = mysqli_fetch_array($result,MYSQLI_ASSOC))
    {
        $id = $row['id'];
        $domain = $row['domain'];
        $word = $row['word'];
        $phrase = $row['phrase'];

        echo "$id<br>";
        echo "$domain<br>";
        echo "$word<br>";
        echo "$phrase<br>";
        echo "<br>";
    }
}

mysqli_stmt_close($stmt);
mysqli_close($conn);

echo 'Total Pages: ' .$total_pages = ceil($rows_count/$limit); echo '<br><br>';

$i = 0;
while($i<$total_pages)
{
    $i++;
    if($bool=='and' || $bool=='or')
    {
        $serps_url = $_SERVER['PHP_SELF'].'?'.'tbl='.urlencode($tbl).'&'.'col_1='.urlencode($col_1).'&'.'col_2='.urlencode($col_2).'&'.'bool='.$bool.'&'.'input_1='.urlencode($input_1).'&'.'input_2='.urlencode($input_2).'&'.'lmt='.intval($limit).'&'.'pg='.intval($i);
    }
    else
    {
        $serps_url = $_SERVER['PHP_SELF'].'?'.'tbl='.urlencode($tbl).'&'.'col_1='.urlencode($col_1).'&'.'bool='.urlencode($bool).'&'.'input_1='.urlencode($input_1).'&'.'lmt='.intval($limit).'&'.'pg='.intval($i);
    }
    if($i==$page)
    {
        echo '<a href="' .htmlspecialchars($serps_url) .'">' ."<b>$i</b>" .'</a>'; //Need to add htmlspecialchars(), to convert '&' to '&amp;', when echoing link here.
    }
    else
    {
        echo '<a href="' .htmlspecialchars($serps_url) .'">' ."$i" .'</a>'; //Need to add htmlspecialchars(), to convert '&' to '&amp;', when echoing link here.
    }
}

echo '<br>';

?>

On the above code, search for the comment:
//Is this really necessary here ?
And answer that question.

Q3. Anything else I need to know apart from I should use pdo ?
Q4. Is my code bad, ok, good or great ? I reckon it is ok.

Thanks

April 30, 2023

Which Is Good Vps Host ?

Programmers,

I need you to recommend me a vps host that provides unlimited & unmetered bandwidths.
Which vps host and their pack do you recommend ?
Gonna run my own searchengine. Building one with php. Gonna start small and so do not want to rent a dedicated server just yet as need to say if my venture picks up or not. Else, waste of resourcses. And time & effort. Not to mention money.
Not interested in cloud hosting as of yet.

April 13, 2023

Impossible To Auto Generate Prepared Statements ?

Hiya,

I know it is possible to auto generate prepared statements based on table column names and numbers. But how to achieve it ?

I have many tables and I want to allow visitors to be able to search these tables.
Now, if I have 20 different tables, I do not want to be writing 20 different prepared statements as that would make the script very long and would slow down the page if tonnes of visitors simultaneously use the page from all across the globe.
And so, I want to only have one set of prepared statement based on which table the visitor selects on the webform that he wants to search.
A typical example:

Imagine these 2 tables:

$link_crawls_table_columns = array('id','date_and_time','domain','url','title','header','meta_keyword','meta_description',);

$link_submits_table_columns = array('id','date_and_time','url','header','description','keyword','keyphrase');

Now, for these 2 tables, I am having to MANUALLY write the prepared statements like this as each tables' column names and numbers are different:

if(!mysqli_stmt_prepare($stmt,$query))
    {
        echo 'Mysqli Error: ' .mysqli_stmt_error($stmt); //DEV Mode line.
        echo '<br>';
        echo 'Mysqli Error No: ' .mysqli_stmt_errno($stmt); //DEV Mode line.
    }
    else
    {
        if($index=='link_crawls')
        {
            //8 inputs.
            mysqli_stmt_bind_param($stmt,"ssssssss",$search,$search,$search,$search,$search,$search,$search,$search);
        }
        else //$index=='link_submits'.
        {
            //7 inputs.
            mysqli_stmt_bind_param($stmt,"sssssss",$search,$search,$search,$search,$search,$search,$search);
        }
    }

Now imagine, I am writing like this for 20 tables!
Code will be too long!
Anyways, look how I am echoing keyword search results from these 2 tables:

while($row = mysqli_fetch_array($result,MYSQLI_ASSOC))
        {
            if($index=='link_crawls')
            {
                $id = $row['id'];
                $date_and_time = $row['date_and_time'];
                $domain = $row['domain'];
                $url = $row['url'];
                $title = $row['title'];
                $header = $row['header'];
                $meta_keyword = $row['meta_keyword'];
                $meta_description = $row['meta_description'];
                echo "<br>";
            }
            else
            {
                $id = $row['id'];
                $date_and_time = $row['date_and_time'];
                $url = $row['url'];
                $header = $row['header'];
                $description = $row['description'];
                $keyword = $row['keyword'];
                $keyphrase= $row['keyphrase'];
                echo "<br>";
            }
        }

Now imagine, I am writing like this for 20 tables!
Too long the code will become!
You get my point ? Need to shorten the code!

That is why, I have to write php code for the code to check which table is getting selected (dropdown or checkbox) on the html form that the visitor wants to search and then check the array associated with that table for the table column names and the column numbers as these 2 data is needed to auto generate the prepared statements.
But I do not know how to do this auto generation of prepared statements. I did a little bit using beginner/intermediate level php programming, which I showed you on my above code and I am showing you some more on my below code. Look:

$table = !EMPTY($_POST['table'])?$_POST['table']:(!EMPTY($_GET['table'])?$_GET['table']:'links_crawls');

$total_table_columns = count(${$table}); echo '<br>';
$cols = array();
for($i=0;$i!==$total_table_columns;$i++)
{
    $cols[] = $col_[$i] = ${$table}[$i]; echo '<br>';
}

if($match == 'exact')
{
    $sql_count = "SELECT * from $table WHERE $col_[0] = ?";
    $sql = "SELECT * from $table WHERE $col_[0] = ?";
    for($i=1;$i!==$total_table_columns;$i++)
    {
        $sql_count .= " OR $col_[$i] = ?";
        $sql .= " OR $col_[$i] = ?";
    }
    $sql .= " OR $col_[$i] = ?";
}
else //Generate Sql for FUZZY MATCH
{
    $sql_count = "SELECT * from $table WHERE $col_[0] LIKE ?";
    $sql = "SELECT * from $table WHERE $col_[0] LIKE ?";
    for($i=1;$i!==$total_table_columns;$i++)
    {
        $sql_count .= " OR $col_[$i] LIKE ?";
        $sql .= " OR $col_[$i] LIKE ?";
    }
    $sql .= " ORDER BY DESC LIMIT $limit OFFSET $offset";
}

Above, I have semi auto generated the table column names by taking the names from the respective array that holds the table column names. And, I am also extracting the number of columns that exist in the table as I will need it for the "s". talking about this part:

mysqli_stmt_bind_param($stmt,"ss",$input_1,$input_2);

April 9, 2023

How To Find DOES NOT CONTAIn or DOES CONTAIN ?

Ladies & Gentlemen,

I got this array:

$test = array('id','date_and_time','kw_1','kw_1_point','kw_2','kw_2_point','kw_3','kw_3_point','kw_4','kw_4_point');

Now I want to echo all the values that does not conatin '_point'.
So, how to do that ?

April 5, 2023

Why I Fail To Extract Link Path Extension ?

Folks,

This is absurd!
As you know, some crawler codes on the internet exist where you get it to navigate to a page and it extracts all html links. hrefs.
Code such as this one:

//Sitemap Protocol: https://www.sitemaps.org/protocol.html

include_once('simplehtmldom_1_9_1/simple_html_dom.php');

//WORKS.
//$sitemap = 'https://www.rocktherankings.com/post-sitemap.xml';
//$sitemap = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.

//FAILS. Shows blank page.
$sitemap = "https://bytenota.com/sitemap.xml";

$html = new simple_html_dom();
$html->load_file($sitemap);

foreach($html->find("loc") as $link)
{
    echo $link->innertext."<br>";
}

And there are those that extract links from xml files.
Like this one:

//Sitemap Crawler: If starting url is an xml file listing further xml files then it will show blank page and not visit the found xml files to extract links from them.
//Sitemap Protocol: https://www.sitemaps.org/protocol.html

// sitemap url or sitemap file
//FAILS.
//$sitemap = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.
//WORKS
//$sitemap = "https://bytenota.com/sitemap.xml";
//$sitemap = 'https://www.rocktherankings.com/post-sitemap.xml';

// get sitemap content
$content = file_get_contents($sitemap);

// parse the sitemap content to object
$xml = simplexml_load_string($content);

// retrieve properties from the sitemap object
foreach ($xml->url as $urlElement) 
{
    // get properties
    $url = $urlElement->loc;
    $lastmod = $urlElement->lastmod;
    $changefreq = $urlElement->changefreq;
    $priority = $urlElement->priority;

    // print out the properties
    echo 'url: '. $url . '<br>';
    echo 'lastmod: '. $lastmod . '<br>';
    echo 'changefreq: '. $changefreq . '<br>';
    echo 'priority: '. $priority . '<br>';

    echo '<br>---<br>';
}

But guess what ?
Both these do not work if you get the crawlers to navigate to an xml file sitemap that lists further xml links or sitemaps.
And so, I am trying to build my own crawler, where when I set it to navigate to an xml sitemap then it should check if the listed links are href links or further xml links to more xml sitemaps.
So what I did was, I first got my crawler to navigate to an xml file.
And now I want it to extract all found links and check whether they found links are hrefs or further xml links.
If the links are hrefs, then add them to the $extracted_urls array.
Else add them to the $crawl_xml_files array.
So later on, the crawler can crawl those extracted href & xml links.
Now, I am stuck on the part where, the code fails to echo the link extensions of the found links on the initially navigated page.
It fails to extract any links to the respective arrays.
Here is the code. Test it and see for yourself where I am going wrong. I am scratching my head.

My UNWORKING CODE

//Sitemap Crawler: If starting url is an xml file listing further xml files then it will show blank page and not visit the found xml files to extract links from them.
//Sitemap Protocol: https://www.sitemaps.org/protocol.html

    //$sitemap = 'https://www.rocktherankings.com/post-sitemap.xml';
    //$sitemap = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.
    $sitemap = 'https://bytenota.com/sitemap.xml';
    //$sitemap = 'https://www.daniweb.com/home-sitemap.xml';
    // get sitemap content
    //$sitemap = 'sitemap.xml';
    // get sitemap content
    $content = file_get_contents($sitemap);

    // parse the sitemap content to object
    $xml = simplexml_load_string($content);
    //var_dump($xml);
    // Init arrays
    $crawl_xml_files = [];
    $extracted_urls = [];
    $extracted_last_mods = [];
    $extracted_changefreqs = [];
    $extracted_priorities = [];
    // retrieve properties from the sitemap object
    foreach ($xml->url as $urlElement) {
        // provide path of curren xml/html file
        $path = (string)$urlElement->loc;
        // get pathinfo
        $ext = pathinfo($path, PATHINFO_EXTENSION);
        echo 'The extension is: ' . $ext;
        echo '<br>'; //DELETE IN DEV MODE

        echo $urlElement; //DELETE IN DEV MODE

        if ($ext == 'xml') //This means, the links found on the current page are not links to the site's webpages but links to further xml sitemaps. And so need the crawler to go another level deep to hunt for the site's html pages.
        {
            echo __LINE__;
            echo '<br>'; //DELETE IN DEV MODE

            //Add Xml Links to array.
            $crawl_xml_files[] = $path;
        } elseif ($ext == 'html' || $ext == 'htm' || $ext == 'shtml' || $ext == 'shtm' || $ext == 'php' || $ext == 'py') //This means, the links found on the current page are the site's html pages and are not not links to further xml sitemaps.
        {
            echo __LINE__;
            echo '<br>'; //DELETE IN DEV MODE

            //Add hrefs to array.
            //$extracted_urls[] = $path;

            // get properties

            $extracted_urls[] = $extracted_url = $urlElement->loc; //Add hrefs to array.
            $extracted_last_mods[] = $extracted_lastmod = $urlElement->lastmod; //Add lastmod to array.
            $extracted_changefreqs[] = $extracted_changefreq = $urlElement->changefreq; //Add changefreq to array.
            $extracted_priorities[] = $extracted_priority = $urlElement->priority; //Add priority to array.
        }
    }

    var_dump($crawl_xml_files); //Print all extracted Xml Links.
    var_dump($extracted_urls); //Print all extracted hrefs.
    var_dump($extracted_last_mods); //Print all extracted last mods.
    var_dump($extracted_changefreqs); //Print all extracted changefreqs.
    var_dump($extracted_priorities); //Print all extracted priorities.

    foreach($crawl_xml_files as $crawl_xml_file)
    {
        echo 'Xml File to crawl: ' .$crawl_xml_file; //Print all extracted Xml Links.
    }

    echo __LINE__; 
    echo '<br>'; //DELETE IN DEV MODE

    foreach($extracted_urls as $extracted_url)
    {
        echo 'Extracted Url: ' .$extracted_url; //Print all extracted hrefs.
    }

    echo __LINE__; 
    echo '<br>'; //DELETE IN DEV MODE

    foreach($extracted_last_mods as $extracted_last_mod)
    {
        echo 'Extracted last Mod: ' .$extracted_last_mod; //Print all extracted last mods.
    }

    echo __LINE__; 
    echo '<br>'; //DELETE IN DEV MODE

    foreach($extracted_changefreqs as $extracted_changefreq)
    {
        echo 'Extracted Change Frequency: ' .$extracted_changefreq; //Print all extracted changefreqs.
    }

    echo __LINE__; 
    echo '<br>'; //DELETE IN DEV MODE

    foreach($extracted_priorities as $extracted_priority)
    {
        echo 'Extracted Priority: ' .$extracted_priority; //Print all extracted priorities.
    }

    echo __LINE__; 
    echo '<br>'; //DELETE IN DEV MODE

How to fix this ?

I get this echoed ....

The extension is:
The extension is:
The extension is:
The extension is:
The extension is:
The extension is:
C:\wamp64\www\Work\buzz\Templates\crawler_Test.php:66:
array (size=0)
empty
C:\wamp64\www\Work\buzz\Templates\crawler_Test.php:67:
array (size=0)
empty
C:\wamp64\www\Work\buzz\Templates\crawler_Test.php:68:
array (size=0)
empty
C:\wamp64\www\Work\buzz\Templates\crawler_Test.php:69:
array (size=0)
empty
C:\wamp64\www\Work\buzz\Templates\crawler_Test.php:70:
array (size=0)
empty
77
85
93
101
109

Obviously, I get tonnes of lines of ...
The extension is:

April 5, 2023

simple_html_dom() ISSUE

Hiya,

I do not understand why this crawler fails to echo found links on a page.

CODE 1

//Sitemap Crawler: If starting url is an xml file listing further xml files then it will just echo the found xml files and not extract links from them.
//Sitemap Protocol: https://www.sitemaps.org/protocol.html

include_once('simplehtmldom_1_9_1/simple_html_dom.php');

//Succeeds to echo found links on these 2 pages.
//$sitemap = 'https://www.rocktherankings.com/post-sitemap.xml';
//$sitemap = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.

//Does not work. Shows blank page. Crawler fails to load the page or extract any found links on the page.
//$sitemap = "https://bytenota.com/sitemap.xml";


$html = new simple_html_dom();
$html->load_file($sitemap);

foreach($html->find("loc") as $link)
{
    echo $link->innertext."<br>";
}

Issue is on this page:

//Does not work. Shows blank page. Crawler fails to load the page or extract any found links on the page.
//$sitemap = "https://bytenota.com/sitemap.xml";

What line of code do I need to add to fix this issue ?

Thanks!

April 5, 2023

How To Order By Adding Total Of All Columns In SQL?

Hiya,

I need to learn the SQL query that counts all the points from more than one column and orders the matching rows based on most points in descending order.

EXAMPLE 1:
I do a keyword search for "mobile phone tutorial apps". Note 4 words.
Sql should find all the rows that contain any of these 4 keywords.
That means, since I searched for 4 keywords, sql should even pull-up those rows that just have even 1 or 2 or 3 of these 4 keywords in them.

Say, my mysql table looks like this:


id |  kw1   | kw1_point |   kw2   |  kw2_point  |   kw3   | kw3_point |    kw4     | kw4_point 
----------------------------------------------------------------------------------------------
0  |  mobile |    3    |  phone  |     3       |  apps    |     2    |    tutorial  |    2
----------------------------------------------------------------------------------------------
1  |  mobile |    1    |  phone  |     1       |  apps    |     3    |    tutorial  |    3
----------------------------------------------------------------------------------------------
2  | tutorial |   3   |   apps   |     3       |  mobile  |     2    |   phone      |    3
----------------------------------------------------------------------------------------------
3  |  mobile |    5    |  tutorial     |      5       |  apps     |      5     |    usa        |    5

NOTE: All 4 keywords exists on the first 3 matching rows. However, only 3 words exist in the final matching row.
And the final matching row's keywords are not in the order of my keyword search.
here in Q1, this should not matter. Sql should ignore in which order the keywords are in each column when comparing the order of my searched keywords. All it should do, is:

A). Find matching rows, regardless of how many of my searched keywords exist on each row;
B). Count the totals of each points, (count more than one column in this case), in each row; And
C) List the rows in the point's descending order.

As you can see, from my example, the following are the keyword points of each row after calculating ALL their keyword points (all point columns):

id 0 = 10 points
id 1 = 8 points
id 2 = 11 points
id 3 = 20

So, in this case, the SQL query should present rows in this descending order:

id 3 = 20
id 2 = 11 points
id 0 = 10 points
id 1 = 8 points


id |  kw1   | kw1_point |   kw2   |  kw2_point  |   kw3   | kw3_point |    kw4     | kw4_point 
----------------------------------------------------------------------------------------------
3  |  mobile |    5    |  tutorial |   5       |  apps    |    5     |    usa    |    5
----------------------------------------------------------------------------------------------
2  | tutorial |   3   |   apps    |    3       |  mobile  |    2     |   phone   |    3
----------------------------------------------------------------------------------------------
0  |  mobile |    3    |  phone  |     3       |  apps    |     2    |    tutorial  |    2
----------------------------------------------------------------------------------------------
1  |  mobile |    1    |  phone  |     1       |  apps    |     3    |    tutorial  |    3
----------------------------------------------------------------------------------------------

Had there been only a single "keyword point" (kw1_point) to calculate, then I would have written the SQL like this using prepared statements:

$sql = "SELECT * from keywords WHERE kw1 = ? OR kw2 = ? OR kw3 = ? OR kw4 = ? order by kw1_point desc LIMIT 100";

The confusion arises when there is more than one column's values to count.
Now the next question ...

March 13, 2023

Which Way Better To Check For Username & Password Correction Out Of These 2

Hiya,

It seems to me that if I put a condition on

mysqli_stmt_fetch($stmt);

to check if user credentials was ok enough for the data to be fetched, then the whole purpose of using the following function is defeated.

password_verify()

This is what I mean ..

function process_login_form()
{
    //Query DB.
    //Check if User already logined or not.
    mysqli_report(MYSQLI_REPORT_ERROR|MYSQLI_REPORT_STRICT);
    $conn = mysqli_connect("localhost","root","","buzz");
 mysqli_connect("server","user","password","db");

    $stmt = mysqli_stmt_init($conn);
    $sql_count = "SELECT password FROM domains WHERE domain_email = ?";

    if(!mysqli_stmt_prepare($stmt,$sql_count))
    {
        unset_sessions();
        echo 'ERROR 1: Something went wrong. Please try again later!';
    }
    else
    {
        mysqli_stmt_bind_param($stmt,"s",$_SESSION['domain_email']);
        mysqli_stmt_execute($stmt);
        mysqli_stmt_bind_result($stmt,$db_password);
        mysqli_stmt_fetch($stmt);
        if(!mysqli_stmt_fetch($stmt))
        {
            echo __LINE__; echo '<br>';
            'ERROR 2: Fetching failed';
        }

        if(!password_verify(trim($_POST['password']),$db_password)) //Incorrect User Credentials.
        {
            echo __LINE__; echo '<br>';
            mysqli_stmt_close($stmt);
            mysqli_close($conn);

            unset_sessions();
            echo 'ERROR 3: <b>Incorrect User Credentials!</b><br>';
        }
        else //Correct User Credentials.
        {
            header('location: home_Template.php');
        }
    }
}

When I type wrong password, the script ends here:

if(!mysqli_stmt_fetch($stmt))
{
    echo __LINE__; echo '<br>';
    'ERROR 2: Fetching failed';
}

So this part prevents user logging into his account if password is wrong. Therefore, no need to check password with password_verify() if I add a condition on mysqli_stmt_fetch().
So now I conclude that, if I add condition on mysqli_stmt_fetch() then no need for me to check for passowrdcorrection with password_verify(),
And, if I want to check for password correction with password_verify() then no need to add condition on mysqli_stmt_fetch(), like so:

if(!mysqli_stmt_fetch($stmt))
{
    echo __LINE__; echo '<br>';
    'ERROR 2: Fetching failed';
}

And I should just write like this:

mysqli_stmt_fetch($stmt)

Correct or not ?
Anyway, what is the downside of checking for password correction with

if(!mysqli_stmt_fetch($stmt))
{
    echo __LINE__; echo '<br>';
    'ERROR 2: Incorrect User Credentials';
}

rather than with

if(!password_verify(trim($_POST['password']),$db_password)) //Incorrect User Credentials.
{
    echo __LINE__; echo '<br>';
    mysqli_stmt_close($stmt);
    mysqli_close($conn);

    unset_sessions();
    echo 'ERROR 3: <b>Incorrect User Credentials!</b><br>';
}
else //Correct User Credentials.
{
    header('location: home_Template.php');
}

March 8, 2023

How Do I Shorten This Conditional Code ?

Hi,

I got this long version code:

    if(ISSET($_GET['limit']))
    {
        $limit = intval($_GET['limit']);
    }
    else
    {
        $limit = intval(1);
    }

I can shorten it, like this and it works:

$limit = ISSET($_GET['limit'])?intval($_GET['limit']):1;

Now add an echo and try shortening it. And I get error.

I got this long code with echo in it:

    if(ISSET($_GET['limit']))
    {
        $limit = intval($_GET['limit']);
        echo "<option value=\"$limit\">$limit</option>";
    }
    else
    {
        $limit = intval(1);
    }

Now, how do I shorten it ?
These 2 fail! Show error:

$limit = ISSET($_GET['limit'])?intval($_GET['limit']);echo "<option value=\"$limit\">$limit</option>";:1;

$limit = (ISSET($_GET['limit'])?intval($_GET['limit']);echo "<option value=\"$limit\">$limit</option>";:1)

March 8, 2023

How To Rid Submit Button Name From Get Method Destination Url ?

Hello Folks,

How to rid GET METHOD from echoing the submitted webform's search button name in the destination url ?
My site search page's url looks like this:
http://localhost/Work/buzz/Templates/pagination_GET_METHOD_TEMPLATE.php
When I click the search button, I am taken to same page $_SERVER['PHP_SELF'];.
Taken to:
http://localhost/Work/buzz/Templates/pagination_GET_METHOD_TEMPLATE.php?search=&match=fuzzy&index=crawled_links_index&limit=1&web_search_button=

NOTE the last part of the destination url:
&web_search_button=
That is the search button's name.

<button type="submit" name="web_search_button" id="web_search_button" title="search the Web">Search!</button>

What to do so destination url looks like following when using GET METHOD ?
http://localhost/Work/buzz/Templates/pagination_GET_METHOD_TEMPLATE.php?search=&match=fuzzy&index=crawled_links_index&limit=1

March 7, 2023

Does Trim Have Closing Bracket ?

This fails:

$password_hashed = password_hash(trim($_POST['password'],PASSWORD_DEFAULT);

This also fails:

$password_hashed = password_hash(trim($_POST['password'],PASSWORD_DEFAULT));

This works:

$password_hashed = password_hash(trim($_POST['password']),PASSWORD_DEFAULT);

But I thought trim( did not have the closing bracket. Manual shows it does.
So this is why above 3rd sample working ?

March 7, 2023

Which Of These Are Valid mysqli_stmt_num_rows() Usages ?

I need to check db for matching user credential on login script.
There must be atleast one matching row. Else, script should alert user not registered.

Need to check the db using the function mysqli_stmt_num_rows().
Assignment is to list all the various valid ways this function can be used to check for mathcing rows.

I conconcted 21 different ways. But I need your assistance to point-out the invalid attempts out of the 21 different attempts. That is all.

Note the 21 different IFs below. Note the comments on each. I queried the db using correct user credentials.
That means, mysqli_stmt_num_rows() should show '1'.

I have labeled my attempts from 1 to 21. I need you to give me the label numbers of the ones that are invalid.
On my tests, whichever attempt showed one matching row found, I put PASS on the comment beside it.
And, whichever attempt showed matching row NOT found, I put FAIL on the comment.
QUESTION: Which of these following 21 are invalid or incorrect way to check for matching rows ? Give me their label numbers.

Note, I know the difference between "=", "==" and "===".
And, I know the difference between "!=", "!==" and "!===".
I added some invalid checks just for experimental purposes.

Thanks

if(!$num_rows = mysqli_stmt_num_rows($stmt)) //FAILS
{
    die('Incorrect User Credentials!');
}

if(!$num_rows==mysqli_stmt_num_rows($stmt)) //FAILS
{
    die('Incorrect User Credentials!');
}

if($num_rows!=mysqli_stmt_num_rows($stmt)) //FAILS
{
    die('Incorrect User Credentials!');
}

if($num_rows!==mysqli_stmt_num_rows($stmt)) //FAILS
{
    die('Incorrect User Credentials!');
}

if($num_rows = mysqli_stmt_num_rows($stmt)!=1) //WORKS
{
    die('Incorrect User Credentials!');
}

if($num_rows = mysqli_stmt_num_rows($stmt)!==1) //WORKS
{
    die('Incorrect User Credentials!');
}

if($num_rows = mysqli_stmt_num_rows($stmt)<1) //WORKS
{
    die('Incorrect User Credentials!');
}

if($num_rows = mysqli_stmt_num_rows($stmt)=0)//FAILS
{
    die('Incorrect User Credentials!');
}

if($num_rows = mysqli_stmt_num_rows($stmt)==0)//WORKS
{
    die('Incorrect User Credentials!');
}

if($num_rows = mysqli_stmt_num_rows($stmt)===0)//WORKS
{
    die('Incorrect User Credentials!');
}

if(!mysqli_stmt_num_rows($stmt)) //FAILS
{
    die('Incorrect User Credentials!');
}

if(mysqli_stmt_num_rows($stmt)=FALSE) //FAILS
{
    die('Incorrect User Credentials!');
}

if(mysqli_stmt_num_rows($stmt)==FALSE) //FAILS
{
    die('Incorrect User Credentials!');
}

if(mysqli_stmt_num_rows($stmt)===FALSE) //WORKS
{
    die('Incorrect User Credentials!');
}

if(!mysqli_stmt_num_rows($stmt)=TRUE) //FAILS
{
    die('Incorrect User Credentials!');
}

if(!mysqli_stmt_num_rows($stmt)==TRUE) //FAILS
{
    die('Incorrect User Credentials!');
}

if(mysqli_stmt_num_rows($stmt)!=TRUE) //FAILS
{
    die('Incorrect User Credentials!');
}

if(mysqli_stmt_num_rows($stmt)!==TRUE) //FAILS
{
    die('Incorrect User Credentials!');
}

if(mysqli_stmt_num_rows($stmt)=NULL) //FAILS
{
    die('Incorrect User Credentials!');
}

if(mysqli_stmt_num_rows($stmt)==NULL) //FAILS
{
    die('Incorrect User Credentials!');
}

if(mysqli_stmt_num_rows($stmt)===NULL) //WORKS
{
    die('Incorrect User Credentials!');
}

CONTEXT

$domain = trim($_POST['domain']);
$domain_email = trim($_POST['domain_email']);
$password_hashed = hash('sha256',trim($_POST['password']));

//Query DB.
//Check if User already registered or not.
mysqli_report(MYSQLI_REPORT_ERROR|MYSQLI_REPORT_STRICT);
$conn = mysqli_connect("localhost","root","","buzz"); //mysqli_connect("server","user","password","db");
$stmt = mysqli_stmt_init($conn);
//$sql = "SELECT id FROM domains WHERE password = ? AND (domain = ?  OR domain_email = ?)";
$sql = "SELECT id FROM domains WHERE (domain = ? OR domain_email = ?) AND password = ?";

if(!mysqli_stmt_prepare($stmt,$sql))
{
    echo __LINE__; echo '<br>';//DELETE

    echo 'Mysqli Error: ' .mysqli_stmt_error(); //DEV MODE.
    echo '<br>';
    echo 'Mysqli Error No: ' .mysqli_stmt_errno(); //DEV MODE.
    echo '<br>';
    die('Login a Failure!');
}
else
{
    echo __LINE__; echo '<br>';//DELETE

    mysqli_stmt_bind_param($stmt,"sss",$domain,$domain_email,$password_hashed);
    mysqli_stmt_execute($stmt);
    mysqli_stmt_bind_result($stmt,$id);
    if(!mysqli_stmt_fetch($stmt)) //This triggers if credentials are wrong.
    {   
        echo __LINE__; echo '<br>';//DELETE

        mysqli_stmt_close($stmt);
        mysqli_close($conn);
        die('Password fetching failed!');
    }
    else
    {
        echo __LINE__; echo '<br>';//DELETE


        //if(!$num_rows = mysqli_stmt_num_rows($stmt)) //FAILS
        //if(!$num_rows==mysqli_stmt_num_rows($stmt)) //FAILS
        //if($num_rows!=mysqli_stmt_num_rows($stmt)) //FAILS
        //if($num_rows!==mysqli_stmt_num_rows($stmt)) //FAILS
        //if($num_rows = mysqli_stmt_num_rows($stmt)!=1) //WORKS
        //if($num_rows = mysqli_stmt_num_rows($stmt)!==1) //WORKS
        //if($num_rows = mysqli_stmt_num_rows($stmt)<1) //WORKS
        //if($num_rows = mysqli_stmt_num_rows($stmt)=0)//FAILS
        //if($num_rows = mysqli_stmt_num_rows($stmt)==0)//WORKS
        //if($num_rows = mysqli_stmt_num_rows($stmt)===0)//WORKS
        //if(!mysqli_stmt_num_rows($stmt)) //FAILS

        //if(mysqli_stmt_num_rows($stmt)=FALSE) //FAILS
        //if(mysqli_stmt_num_rows($stmt)==FALSE) //FAILS
        //if(mysqli_stmt_num_rows($stmt)===FALSE) //WORKS

        //if(!mysqli_stmt_num_rows($stmt)=TRUE) //FAILS
        //if(!mysqli_stmt_num_rows($stmt)==TRUE) //FAILS
        //if(mysqli_stmt_num_rows($stmt)!=TRUE) //FAILS
        //if(mysqli_stmt_num_rows($stmt)!==TRUE) //FAILS
        //if(mysqli_stmt_num_rows($stmt)=NULL) //FAILS
        //if(mysqli_stmt_num_rows($stmt)==NULL) //FAILS
        //if(mysqli_stmt_num_rows($stmt)===NULL) //WORKS
        {
            die('Incorrect User Credentials!');
        }

    mysqli_stmt_close($stmt);
    mysqli_close($conn);

    echo __LINE__; echo '<br>';//DELETE
    echo 'password: ' .$password; echo '<br>';
    echo 'hashed password: ' .$hashed_password; echo '<br>';

    header('location: home_Template.php');
    exit;
}

Experimented half the night and some test results confused me which I need clearing.