These functions let you extract any tag from a web page, read its contents, and return an array containing (a) the contents and (b) the number of occurrences. Also included: retrieving e-mail addresses, the doctype, keywords, meta tags, link rel's, p tags, h tags, comments, inline classes and id's, title attributes, images, image alt descriptions, separating internal links from external links, and much more.
/* Most functions return the contents of the requested tags in array[0] */
/* and the number of matches in array[1], except where a separate function */
/* is provided to retrieve the count. */
/**
 * Collect rel/href attribute pairs (for example from <link> tags in the head).
 *
 * Only attributes written as rel="..." followed by one space and href="...",
 * in that order and with double quotes, are recognised.
 *
 * Unlike the other extractors in this file, element [0] is the complete
 * preg_match_all() result: [0][0] holds the full matches, [0][1] the literal
 * "rel=", [0][2] the quoted rel values and [0][3] the quoted href values.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => match groups, 1 => number of matches)
 */
function get_link_rel($file){
    // [^"]* keeps each value inside its own pair of quotes. The previous greedy
    // ".*" swallowed every following attribute up to the last quote on the line
    // and merged several tags on one line into a single match.
    preg_match_all('/(rel=)("[^"]*") href=("[^"]*")/i', $file, $patterns);
    return array($patterns, count($patterns[2]));
}
/**
 * Extract the content of every <h1> element.
 *
 * The pattern is lazy (U modifier), so each match stops at the first closing
 * </h1>. A heading is only captured when a word character directly follows a
 * ">" after the opening "<h1".
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of heading contents, 1 => number of headings)
 */
function get_h1($file){
    $matches = array();
    preg_match_all("/(<h1.*>)(\w.*)(<\/h1>)/isxmU", $file, $matches);
    $headings = $matches[2];
    return array($headings, count($headings));
}
/**
 * Extract the content of every <h2> element.
 *
 * The pattern is lazy (U modifier), so each match stops at the first closing
 * </h2>. A heading is only captured when a word character directly follows a
 * ">" after the opening "<h2".
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of heading contents, 1 => number of headings)
 */
function get_h2($file){
    $matches = array();
    preg_match_all("/(<h2.*>)(\w.*)(<\/h2>)/isxmU", $file, $matches);
    $headings = $matches[2];
    return array($headings, count($headings));
}
/**
 * Extract the content of every <h3> element.
 *
 * The pattern is lazy (U modifier), so each match stops at the first closing
 * </h3>. A heading is only captured when a word character directly follows a
 * ">" after the opening "<h3".
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of heading contents, 1 => number of headings)
 */
function get_h3($file){
    $matches = array();
    preg_match_all("/(<h3.*>)(\w.*)(<\/h3>)/ismU", $file, $matches);
    $headings = $matches[2];
    return array($headings, count($headings));
}
/**
 * Extract the content of every <h4> element.
 *
 * The pattern is lazy (U modifier), so each match stops at the first closing
 * </h4>. A heading is only captured when a word character directly follows a
 * ">" after the opening "<h4".
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of heading contents, 1 => number of headings)
 */
function get_h4($file){
    $matches = array();
    preg_match_all("/(<h4.*>)(\w.*)(<\/h4>)/ismU", $file, $matches);
    $headings = $matches[2];
    return array($headings, count($headings));
}
/**
 * Extract the content of every <h5> element.
 *
 * The pattern is lazy (U modifier), so each match stops at the first closing
 * </h5>. A heading is only captured when a word character directly follows a
 * ">" after the opening "<h5".
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of heading contents, 1 => number of headings)
 */
function get_h5($file){
    $matches = array();
    preg_match_all("/(<h5.*>)(\w.*)(<\/h5>)/ismU", $file, $matches);
    $headings = $matches[2];
    return array($headings, count($headings));
}
/**
 * Extract the content of every <h6> element.
 *
 * The pattern is lazy (U modifier), so each match stops at the first closing
 * </h6>. A heading is only captured when a word character directly follows a
 * ">" after the opening "<h6".
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of heading contents, 1 => number of headings)
 */
function get_h6($file){
    $matches = array();
    preg_match_all("/(<h6.*>)(\w.*)(<\/h6>)/ismU", $file, $matches);
    $headings = $matches[2];
    return array($headings, count($headings));
}
/**
 * Extract the content of every <p> element.
 *
 * The pattern is lazy (U modifier), so each match stops at the first closing
 * </p>. A paragraph is only captured when a word character directly follows a
 * ">" after the opening "<p".
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of paragraph contents, 1 => number of paragraphs)
 */
function get_p($file){
    // \b stops "<p" from also matching longer tag names such as <pre> or
    // <param>, which previously produced bogus "paragraphs".
    preg_match_all('/(<p\b.*>)(\w.*)(<\/p>)/ismU', $file, $patterns);
    return array($patterns[2], count($patterns[2]));
}
/**
 * Extract the link text of <a> elements.
 *
 * The text is captured from the first word character that directly follows a
 * ">" after "<a", up to the next tag of any kind (lazy match).
 *
 * @param string $file HTML source to scan
 * @return array list of link texts
 */
function get_a_content($file){
    // \b stops "<a" from also matching <abbr>, <area>, <article>, <address>
    // and similar, whose contents were previously reported as link names.
    preg_match_all('/(<a\b.*>)(\w.*)(<.*>)/ismU', $file, $patterns);
    return $patterns[2];
}
/**
 * Extract the value of every double-quoted href attribute.
 *
 * The match is case-insensitive and not limited to <a> tags: any href="..."
 * in the input is returned.
 *
 * @param string $file HTML source to scan
 * @return array list of href values, in document order
 */
function get_a_href($file){
    $found = array();
    preg_match_all('/(href=")(.*?)(")/i', $file, $found);
    $targets = $found[2];
    return $targets;
}
/**
 * Count the <a> elements that carry a double-quoted href attribute.
 *
 * @param string $file HTML source to scan
 * @return int number of <a ... href="...">...</a> elements found
 */
function get_a_href_count($file){
    // Every part of the pattern is bounded ([^>]*, [^"]*, lazy .*?) so each
    // link is matched on its own. The previous greedy ".*" collapsed all links
    // on one line into a single match and therefore under-counted.
    // "i" and "s" also accept upper-case tags and links spanning several lines.
    $count = preg_match_all('/<a\b[^>]*?\shref="[^"]*"[^>]*>.*?<\/a>/is', $file, $patterns);
    // preg_match_all() yields false on a PCRE error; report that as zero links.
    return (int) $count;
}
/**
 * Return, for every <a> element with a double-quoted href, the raw attribute
 * text that follows the href attribute inside the opening tag.
 *
 * Example: <a href="x" class="c">..</a> yields ' class="c"'. A link with
 * nothing after the href yields an empty string.
 *
 * @param string $file HTML source to scan
 * @return array list of attribute strings, one per matched link
 */
function get_a_additionaltags($file){
    // [^>]* confines the capture to the opening tag. The previous greedy
    // "(.*)>" ran on into the link body whenever it contained further tags.
    preg_match_all('/<a\b[^>]*?\shref="[^"]*"([^>]*)>.*?<\/a>/is', $file, $patterns);
    return $patterns[1];
}
/**
 * Extract the body of every <script> element.
 *
 * Matching is lazy (U modifier) and spans line breaks (s modifier), so each
 * result runs from the end of the opening tag to the first </script>.
 * Scripts without inline code contribute an empty string.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of script bodies, 1 => number of scripts)
 */
function get_script($file){
    $matches = array();
    preg_match_all('/(<script.*>)(.*)(<\/script>)/imxsU', $file, $matches);
    $bodies = $matches[2];
    return array($bodies, count($bodies));
}
/**
 * Extract the inner HTML of every <ul> element.
 *
 * Matching is lazy (U modifier), so each result ends at the first </ul>;
 * nested lists are therefore cut at the inner closing tag.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of inner HTML strings, 1 => number of lists)
 */
function get_ul($file){
    // The old pattern "<ul \w*>" was compiled with the x modifier, which drops
    // the literal space, so it only matched a bare <ul> and skipped any list
    // that had attributes. "<ul\b[^>]*>" accepts both forms.
    preg_match_all('/(<ul\b[^>]*>)(.*)(<\/ul>)/isU', $file, $patterns);
    return array($patterns[2], count($patterns[2]));
}
/**
 * Extract the inner HTML of every <li> element.
 *
 * Matching is lazy (U modifier), so each result ends at the first </li>.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of inner HTML strings, 1 => number of items)
 */
function get_li($file){
    // The old pattern "<li \w*>" was compiled with the x modifier, which drops
    // the literal space, so items with attributes were skipped.
    // \b keeps <link> and similar tag names from matching.
    preg_match_all('/(<li\b[^>]*>)(.*)(<\/li>)/isU', $file, $patterns);
    return array($patterns[2], count($patterns[2]));
}
/**
 * Collect the values of all double-quoted id attributes.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of id values, 1 => number of ids)
 */
function get_ids($file){
    // [^"]* accepts any id value; the old \w* skipped ids containing hyphens.
    // The look-behind rejects attributes that merely end in "id", such as
    // data-id="..." or valid="...".
    preg_match_all('/(?<![\w-])id="([^"]*)"/i', $file, $patterns);
    return array($patterns[1], count($patterns[1]));
}
/**
 * Collect every class name used in double-quoted class attributes.
 *
 * Attributes listing several classes are split on whitespace, so each class
 * name becomes its own entry. Repeated names are kept once per use, and empty
 * class attributes contribute nothing.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of class names, 1 => number of class names)
 */
function get_classes($file){
    // [^"]* accepts the whole attribute value. The old \w* silently dropped any
    // attribute that held more than one class or a hyphenated class name.
    preg_match_all('/(?<![\w-])class="([^"]*)"/i', $file, $patterns);

    $classes = array();
    foreach ($patterns[1] as $value) {
        foreach (preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY) as $name) {
            $classes[] = $name;
        }
    }
    return array($classes, count($classes));
}
/**
 * Return the raw attribute text of every <meta> tag.
 *
 * Each entry is everything between "<meta" and the end of the tag, for example
 * ' name="a" content="b"' (leading space included, trailing whitespace and the
 * optional self-closing slash removed).
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of attribute strings, 1 => number of meta tags)
 */
function get_meta_content($file){
    // [^>]*? keeps each match inside one tag and accepts both <meta ...> and
    // <meta ... />. The old pattern demanded exactly one character before "/>"
    // and, being greedy, merged several meta tags on one line into one result.
    preg_match_all('/<meta\b([^>]*?)\s*\/?>/i', $file, $patterns);
    return array($patterns[1], count($patterns[1]));
}
/**
 * Collect the values of all double-quoted inline style attributes.
 *
 * The match is case-insensitive and lazy, so each value ends at the first
 * closing double quote.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of style declarations, 1 => number of style attributes)
 */
function get_styles($file){
    $matches = array();
    preg_match_all('/(style=")(.*?)(")/is', $file, $matches);
    $declarations = $matches[2];
    return array($declarations, count($declarations));
}
/**
 * Collect the values of all double-quoted title attributes.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of title values, 1 => number of title attributes)
 */
function get_tag_titles($file){
    // [^"]* ends the value at its own closing quote. The old greedy "(.*)" ran
    // to the last quote on the line, dragging in later attributes and hiding
    // further titles on the same line. The look-behind skips attribute names
    // that merely end in "title".
    preg_match_all('/(?<![\w-])title="([^"]*)"/i', $file, $patterns);
    return array($patterns[1], count($patterns[1]));
}
/**
 * Collect the non-empty values of all quoted alt attributes.
 *
 * Both double- and single-quoted values are accepted; empty alt attributes
 * are skipped.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of alt texts, 1 => number of alt texts)
 */
function get_image_alt($file){
    // Capture everything up to the matching closing quote. The old character
    // class [a-zA-Z0-9\s] cut the text off at the first punctuation mark.
    preg_match_all(
        '/(?<![\w-])alt=(?:"([^"]+)"|\'([^\']+)\')/i',
        $file,
        $matches,
        PREG_SET_ORDER
    );

    $alts = array();
    foreach ($matches as $match) {
        // Group 2 is only present (and non-empty) for single-quoted values.
        $alts[] = (isset($match[2]) && $match[2] !== '') ? $match[2] : $match[1];
    }
    return array($alts, count($alts));
}
/**
 * Collect the src value of every <img> tag.
 *
 * The src attribute may appear anywhere inside the tag, may have whitespace
 * around "=", and may use double or single quotes. Empty src values are
 * skipped.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of image sources, 1 => number of images)
 */
function get_images($file){
    // The old pattern required src to follow "<img" after exactly one
    // whitespace character (the x modifier dropped the literal space) and
    // limited the value to a small character set without "-", "%", "~" and
    // others, so many ordinary image tags were missed.
    preg_match_all('/<img\b[^>]*?\ssrc\s*=\s*(["\'])(.*?)\1/is', $file, $patterns);

    $sources = array();
    foreach ($patterns[2] as $src) {
        if ($src !== '') {
            $sources[] = $src;
        }
    }
    return array($sources, count($sources));
}
/**
 * Collect the e-mail addresses used in mailto: links.
 *
 * Only <a> tags with a double-quoted href starting with "mailto:" are
 * considered. Any query part such as "?subject=..." is not included in the
 * returned address.
 *
 * @param string $file HTML source to scan
 * @return array array(0 => list of addresses, 1 => number of mailto links)
 */
function get_mailto($file){
    // [^"?]+ accepts every address character up to the closing quote or the
    // query string. The old class [a-zA-Z@0-9\.] rejected addresses containing
    // "_", "-" or "+", and the old pattern required href to be the first
    // attribute of the tag.
    preg_match_all('/<a\b[^>]*?\shref="mailto:([^"?]+)/is', $file, $patterns);
    return array($patterns[1], count($patterns[1]));
}
/**
 * Find every e-mail address that appears anywhere in the text.
 *
 * @param string $file text or HTML source to scan
 * @return array array(0 => list of addresses, 1 => number of addresses)
 */
function get_emails($file){
    // Local part: letters, digits and . _ % + - (the old pattern had no ".",
    // so "john.doe@..." was reported as "doe@...").
    // Domain: one or more dot-separated labels followed by an alphabetic TLD,
    // so multi-level domains such as example.co.uk are returned in full.
    preg_match_all('/[a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}/', $file, $patterns);
    return array($patterns[0], count($patterns[0]));
}
/**
 * Count how often a keyword occurs in the given text.
 *
 * The keyword is matched literally and case-sensitively; overlapping
 * occurrences are not counted twice.
 *
 * @param string $word keyword to look for
 * @param string $file text or HTML source to scan
 * @return int number of occurrences (0 for an empty keyword)
 */
function countkeyword($word,$file){
    // An empty pattern would match at every position of the text.
    if ((string) $word === '') {
        return 0;
    }
    // preg_quote() makes regex metacharacters in the keyword literal.
    // The previous code returned count($patterns), which is the number of
    // capture groups (always 4) and not the number of matches.
    $hits = preg_match_all('/' . preg_quote($word, '/') . '/', $file, $patterns);
    return $hits === false ? 0 : $hits;
}
/**
 * Reduce a URL to its base: scheme, host and, when present, port.
 *
 * @param string $url absolute URL, e.g. "http://www.example.com/a/b?c=1"
 * @return string base URL such as "http://www.example.com", or an empty
 *                string when the input has no scheme or host
 */
function get_main_url($url){
    $parts = parse_url($url);
    // parse_url() returns false for seriously malformed input and omits keys it
    // cannot find; bail out instead of reading missing array offsets.
    if ($parts === false || empty($parts['scheme']) || empty($parts['host'])) {
        return '';
    }
    $main = $parts['scheme'] . '://' . $parts['host'];
    // Keep an explicit port so the result still addresses the same server.
    if (isset($parts['port'])) {
        $main .= ':' . $parts['port'];
    }
    return $main;
}
/**
 * Return the bare domain name of a URL, without subdomain ("www") and without
 * the top-level domain, e.g. "http://www.example.com/x" gives "example".
 *
 * Country-code suffixes with a generic second level (co.uk, com.au, ...) are
 * recognised by a small heuristic only; this is not a full public-suffix
 * lookup. IP addresses and single-label hosts are returned unchanged.
 *
 * @param string $url URL or host name
 * @return string domain name, or an empty string when no host can be found
 */
function get_domain_name_only($url){
    $url = trim((string) $url);
    // parse_url() only reports a host when a scheme separator is present.
    if (strpos($url, '://') === false) {
        $url = 'http://' . ltrim($url, '/');
    }
    // Working on the parsed host keeps paths and query strings out of the
    // result. The old regex returned '' for hosts without a subdomain and
    // mixed path segments into the name.
    $host = parse_url($url, PHP_URL_HOST);
    if (!is_string($host) || $host === '') {
        return '';
    }
    if (filter_var(trim($host, '[]'), FILTER_VALIDATE_IP) !== false) {
        return $host;
    }

    $labels = explode('.', $host);
    $count = count($labels);
    if ($count === 1) {
        return $labels[0];
    }

    // Normally the name is the label directly in front of the TLD.
    $index = $count - 2;
    $genericSecondLevel = array('co', 'com', 'org', 'net', 'gov', 'ac', 'edu');
    if (
        $count >= 3
        && strlen($labels[$count - 1]) === 2
        && in_array(strtolower($labels[$index]), $genericSecondLevel, true)
    ) {
        // e.g. example.co.uk: step over the generic second-level label.
        $index--;
    }
    return $labels[$index];
}
?>
// Report the document title.
// NOTE(review): $title and $styles are filled in outside this fragment;
// presumably [0] holds the extracted values. Confirm against the caller.
if(!empty($title[0])){
    // The title comes from the scanned page, so escape it before echoing.
    // The previous preg_replace("/</","<",...) replaced "<" with itself and
    // therefore escaped nothing.
    echo "<br/>Title found: " . htmlentities($title[0]);
}else{
    echo"<br/><div class=\"error\">Page does not have a title</div><br/>";
}
// Report inline style attributes, one list item per style value.
if(!empty($styles[0])){
    echo "<br/>inline styles:<ul>";
    foreach($styles[0] as $val){
        echo "<li>" . htmlentities($val) . "</li>";
    }
    // Close the list before the notice: a <div> is not valid directly inside <ul>.
    echo "</ul><div class=\"notice\">Your document uses inline styles. If applicable, try to put them into a separate CSS file and restyle them to ID's or CLASSES.</div>";
}else{
    echo "<br/>No inline styles used";
}
}
?>
</body>
</html>
Ajit Raj wrote :1753
I want to point out a problem with this function:
// Retrieve the src values of images on the site.
// Returns array(0 => list of src values (capture group 3), 1 => their count).
function get_images($file){
// The x modifier makes PCRE ignore the literal space after \s, so exactly one
// whitespace character must separate "<img" and "src". The attribute must be
// written as src=" with no spaces around "=", and the value may only contain
// letters, digits and the characters . ; : / ? & = _ | plus CR and LF.
$h1count = preg_match_all('/(<img)\s (src="([a-zA-Z0-9\.;:\/\?&=_|\r|\n]{1,})")/isxmU',$file,$patterns);
$res = array();
array_push($res,$patterns[3]);
array_push($res,count($patterns[3]));
return $res;
}
This function gives output like images/abc.jpg when the tag is <img src="images/abc.img"/>.
But if the tag is <img src="abc.img"/>, <img src = "abc.img"/>, <img src= "abc.img"/> or <img src ="abc.img"/>, the image is not shown.
So please help me with how to fix this.
thanks
Ajit Raj
yadav.raj07@gmail.com
Mark Urquhart-Webb wrote :1806
// retrieve any email
/* I made a modification to your get_emails() function: this one allows for two-level domains (.co.uk etc.). It also uses shorthand classes for alphanumerics. In use on www.scamdex.com
*/