Bug in retrieving and modifying pages using xpath

Question

I've written some code to modify web pages using XPath in PHP. The code below fetches an HTML page, extracts one part of it, and then deletes a sub-part of the extracted fragment.

It works in some scenarios, such as: http://chijoori.ir/excel-tutorial/

but fails in others such as: http://delbaraneh.com/decorations/the-latest-interior-96/

// Input: the page to fetch, the XPath of the fragment to keep, and the XPath
// (evaluated inside that fragment) of the nodes to remove. An empty
// $delete_xpath means "remove nothing".
$link="http://chijoori.ir/excel-tutorial/";
$add_xpath="//h1";
$delete_xpath="";


// Second scenario from the report (the one that fails); uncomment to reproduce.
//$link="http://delbaraneh.com/decorations/the-latest-interior-96/";
//$add_xpath="//h1";
//$delete_xpath="";


// Not used anywhere in this snippet; kept in case surrounding code relies on it.
$res=array();

// Final serialized output; stays empty when the page could not be fetched or
// $add_xpath matched nothing.
$fullcontent = '';

$result = browser_test($link);

// Real-world HTML is rarely valid. Buffer libxml's parse errors instead of
// muting every error on the call with "@".
$previousLibxmlSetting = libxml_use_internal_errors(true);

// Step 1: serialize the first node matched by $add_xpath.
// $v stays '' when the download failed or nothing matched, so the later steps
// never read an undefined variable.
$v = '';
if (is_string($result) && $result !== '') {
    $dom = new DOMDocument();
    // NOTE(review): loadHTML() relies on the page declaring its own charset;
    // confirm the target pages do, otherwise non-ASCII text may be misdecoded.
    $dom->loadHTML($result);
    $xpath = new DOMXPath($dom);

    $query = $xpath->query($add_xpath);
    if ($query !== false && $query->length > 0) {
        $serialized = $dom->saveXML($query->item(0));
        if (is_string($serialized)) {
            $v = $serialized;
        }
    }
}

// Step 2: re-parse the fragment on its own, drop the unwanted nodes, serialize.
if ($v !== '') {
    $dom2 = new DOMDocument();
    // Turn every non-ASCII character into a numeric entity so the parser does
    // not have to guess the encoding. This replaces
    // mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8'), which is deprecated
    // since PHP 8.2; the parsed DOM is the same.
    $dom2->loadHTML(mb_encode_numericentity($v, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8'));
    $xpath2 = new DOMXPath($dom2);

    // DOMXPath::query('') raises an "Invalid expression" warning, so only
    // evaluate the delete expression when one was actually supplied.
    if ($delete_xpath !== '') {
        $elements = $xpath2->query($delete_xpath);
        if ($elements !== false) {
            foreach ($elements as $element) {
                // The document node itself has no parent and cannot be removed.
                if ($element->parentNode !== null) {
                    $element->parentNode->removeChild($element);
                }
            }
        }
    }

    $fullcontent = $dom2->saveXML();
}

// Discard the buffered parse errors and restore the caller's libxml setting.
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

echo $fullcontent;





/**
 * Download a URL and return the response body.
 *
 * The request is made with cURL first (custom user agent, 5 second connect and
 * total timeouts, any content encoding cURL supports, up to 15 redirects).
 * If cURL fails or returns an empty body, file_get_contents() is tried as a
 * fallback.
 *
 * @param string $url Address of the page to fetch.
 *
 * @return string|false The response body, or whatever the file_get_contents()
 *                      fallback returned (false when that fails too).
 */
function browser_test($url){

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; CrawlBot/1.0.0)');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT , 5);
    curl_setopt($ch, CURLOPT_TIMEOUT, 5);
    // Empty string: advertise every content encoding cURL supports and decode it.
    curl_setopt($ch, CURLOPT_ENCODING, "");
    // MAXREDIRS and AUTOREFERER only take effect when redirects are followed;
    // without FOLLOWLOCATION a redirecting site yields the redirect stub page
    // instead of the real content.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 15);
    // Certificate verification is turned off, so HTTPS responses are not
    // authenticated. HTTPS itself does not require this.
    // NOTE(review): enable verification once a CA bundle is available.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    $html = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; an empty body is treated as a
    // failure as well and retried through PHP's stream wrapper.
    if ($html === false || $html === '') {
        $html = file_get_contents($url);
    }
    return $html;
}

Show source
| dom   | php   | xpath   2017-01-07 19:01 0 Answers

Answers to Bug in retrieving and modifying pages using xpath ( 0 )

Leave a reply to - Bug in retrieving and modifying pages using xpath

◀ Go back