<?php
#########################################################################
# linkGrabber.php v1.1 #
# ----------- #
# Copyright (C) 2005 Aristidis Karidis, aris.karidis@bcs.org #
# ---------------------------------------------------------- #
# This function grabs the links from one or more URLs or local files. #
# #
#########################################################################
# #
# This program is free software; you can redistribute it and/or #
# modify it under the terms of the GNU General Public License #
# as published by the Free Software Foundation; either version 2 #
# of the License, or (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# ------------------------------------ #
# http://www.gnu.org/copyleft/gpl.html #
#########################################################################
/*
Grabs the links from one or more URLs or local files.

Every source is read with file_get_contents(), all single and double
quotes are stripped from the text, and the text is then scanned for
'<a ' tags that carry an 'href=' attribute. Anchors without an href
(for example '<a name=top>') are skipped.

For each anchor the href and the label (the text between '>' and '</a>')
are collected. A label that contains an '<img' tag is replaced by a
rebuilt '<img border="0" src=...>' tag. Relative href and src values are
made absolute by prefixing the source they were read from: a value that
starts with '/' is appended directly, any other value without a known
scheme is appended after a '/'.

A source that cannot be read, or that contains no links, is reported
with an empty array.

@param array $url -- a URL, local file or array of URLs/files.
@param int $unique -- filters duplicate hrefs if set to 1; doesn't if set to 0.
                      Any other value prints an error message and exits.
@return array -- array(source => array('<a href=HREF>LABEL</a>' => HREF)).
*/
function linkGrabber($url, $unique = 1)
{
    $startTag = '<a ';
    $hrefTag = 'href=';
    $endTag = '>';
    $closingTag = '</a>';
    $whitespace = " \t\r\n";
    $quotes = array('"', "'");
    $linkSchemes = array('http://', 'https://', 'mailto:', 'ftp://');
    $imageSchemes = array('http://', 'https://');
    $results = array();

    if (!is_array($url))
    {
        $url = array($url);
    }
    if ($unique !== 0 && $unique !== 1)
    {
        printf('Invalid parameter for $unique. The parameter must be either 1 or 0.');
        exit();
    }

    foreach ($url as $value)
    {
        set_time_limit(0); # In case we have several large pages
        $links = array();

        $contents = file_get_contents($value);
        if (!is_string($contents))
        {
            # Unreadable source: report it with no links instead of failing
            $contents = '';
        }
        # Strip " and ' once, so attribute values can be read without quotes
        $contents = str_replace($quotes, '', $contents);
        $length = strlen($contents);
        $offset = 0;

        while ($offset < $length)
        {
            # Next '<a ' and the '>' that ends that tag
            $tagStart = stripos($contents, $startTag, $offset);
            if ($tagStart === false)
            {
                break;
            }
            $tagEnd = strpos($contents, $endTag, $tagStart);
            if ($tagEnd === false)
            {
                # Unterminated tag: nothing more can be parsed
                break;
            }

            # Look for 'href=' inside this tag only, so that an anchor
            # without href cannot pick up the href of some later element
            $tag = substr($contents, $tagStart, $tagEnd - $tagStart);
            $hrefPosition = stripos($tag, $hrefTag);
            if ($hrefPosition === false)
            {
                $offset = $tagEnd + 1;
                continue;
            }
            # Keep the value up to the first whitespace (next attribute)
            $href = (string) substr($tag, $hrefPosition + strlen($hrefTag));
            $href = (string) substr($href, 0, strcspn($href, $whitespace));

            # Everything between '>' and '</a>'
            $labelStart = $tagEnd + 1;
            $closingPosition = stripos($contents, $closingTag, $labelStart);
            if ($closingPosition === false)
            {
                # No closing tag: the rest of the text is the label
                $label = (string) substr($contents, $labelStart);
                $offset = $length;
            }
            else
            {
                $label = (string) substr($contents, $labelStart, $closingPosition - $labelStart);
                $offset = $closingPosition + strlen($closingTag);
            }

            # Rebuild an image label with an absolute src
            $imagePosition = stripos($label, '<img');
            if ($imagePosition !== false)
            {
                $src = '';
                $srcPosition = stripos($label, 'src=', $imagePosition);
                if ($srcPosition !== false)
                {
                    $src = (string) substr($label, $srcPosition + 4);
                    # The value ends at whitespace or at the end of the tag
                    $src = (string) substr($src, 0, strcspn($src, $whitespace.'>'));
                }
                $src = _linkGrabberMakeAbsolute($value, $src, $imageSchemes);
                $label = '<img border="0" src='.$src.'>';
            }

            $href = _linkGrabberMakeAbsolute($value, $href, $linkSchemes);
            $links['<a href='.$href.'>'.$label.'</a>'] = $href;
        }

        if ($unique === 1)
        {
            # array_unique() compares the hrefs and keeps the first anchor of each
            $results[$value] = array_unique($links);
        }
        else
        {
            $results[$value] = $links;
        }
    }
    return $results;
}

/*
Helper for linkGrabber(): turns a relative link into an absolute one.

@param string $base -- the URL or file the link was found in.
@param string $link -- the link as written in the page (quotes removed).
@param array $schemes -- prefixes (compared case-insensitively) that mark
                         a link as already absolute.
@return string -- the link unchanged when it is empty or starts with one
                  of $schemes; $base.$link when it starts with '/';
                  $base.'/'.$link otherwise.
*/
function _linkGrabberMakeAbsolute($base, $link, $schemes)
{
    if ($link === '')
    {
        return $link;
    }
    if ($link[0] === '/')
    {
        return $base.$link;
    }
    foreach ($schemes as $scheme)
    {
        if (strncasecmp($link, $scheme, strlen($scheme)) === 0)
        {
            return $link;
        }
    }
    return $base.'/'.$link;
}
/**
 * Prints an array of links, as returned by linkGrabber(), as HTML.
 *
 * If $simplePresentation = 0 every source with at least one link gets a
 * table: a header row naming the source, then one row per link holding the
 * live anchor (the array key, echoed as markup) and the link text (the
 * array value).
 * If $simplePresentation = 1 only the link text is printed, one per line.
 * In both modes the total number of links is printed last.
 *
 * The source name and the link text come from fetched pages, so they are
 * HTML-escaped before being echoed. The anchor markup in the array key is
 * echoed as-is because it is meant to render as a link.
 * NOTE(review): that markup is built from remote content and is therefore
 * not safe to show to untrusted viewers.
 *
 * Invalid parameters print an error message and end the script.
 *
 * @param array $results -- array(source => array(anchor markup => link text)).
 * @param int $simplePresentation -- 0 for tables, 1 for plain link text.
 */
function showResults($results, $simplePresentation = 0)
{
    if (!is_array($results))
    {
        printf('Invalid parameter for $results. The parameter must be an array.');
        exit();
    }
    if ($simplePresentation !== 0 && $simplePresentation !== 1)
    {
        printf('Invalid parameter for $simplePresentation. The parameter must be either 1 or 0.');
        exit();
    }

    $escapeFlags = ENT_QUOTES | ENT_SUBSTITUTE;
    $total = 0;
    foreach ($results as $k => $v)
    {
        if (!is_array($v))
        {
            # Not a list of links: nothing to show or count
            continue;
        }
        $found = count($v);

        if ($simplePresentation === 1)
        {
            foreach ($v as $value)
            {
                echo htmlspecialchars((string) $value, $escapeFlags, 'UTF-8').'<br>';
            }
        }
        elseif ($found > 0)
        {
            # A table is only opened for sources that have links
            $source = htmlspecialchars((string) $k, $escapeFlags, 'UTF-8');
            echo '<table align="center">';
            echo '<tr><td colspan="2" bgcolor="Gray">Links found in <a href="'.$source.'">'.$source.'</a></td></tr>';
            foreach ($v as $key => $value)
            {
                echo '<tr><td align="right">'.$key.'</td><td>'.htmlspecialchars((string) $value, $escapeFlags, 'UTF-8').'</td></tr>';
            }
            echo '</table>';
        }

        $total = $total + $found;
    }
    echo '<br>Total Number of Links: '.$total;
}
?>
<!--
Example
Un-comment showResults($links, 1); to see the alternative output.
Un-comment $links = linkGrabber($array, 0); to get duplicate links
-->
<html>
<head>
</head>
<body>
<?php
# Example run: grab the links of several sites and print them as tables.
$array = array('http://www.weberdev.com', 'http://www.google.com', 'http://www.php.net', 'http://www.zend.com', 'http://www.phparch.com', 'http://www.bbc.co.uk');
$links = linkGrabber($array);
showResults($links);
# Alternatives: keep duplicate links / print the link text only.
#$links = linkGrabber($array, 0);
#showResults($links, 1);
?>
</body>
</html>
|
| How to ifconfig down/up a list of IP's Categories : Arrays, Strings, Filesystem, PHP | | | Read a file with strings and create a new file with the
first half of each string Categories : PHP, Strings, Filesystem | | | Grab images from one or more URLs and save them to a specified local directory. Categories : PHP, Filesystem, Strings, Arrays | | | Compare two texts and display a block of text with the differences between them. Categories : PHP, PHP Classes, Filesystem, Strings, Arrays | | | Variable serialization and unserialization. Loading and saving variable structures
to and from file. Categories : Arrays, Filesystem, Variables, Strings, PHP | | | How to find the name of the current file? Categories : PHP, Filesystem, Strings | | | Working with files - return an array of files within a directory Categories : PHP, Strings, Variables, Filesystem | | | how can I read the entire contents of a file into a string? Categories : Filesystem, Strings, PHP | | | Working with files - putting file contents to a string / var Categories : PHP, Filesystem, Variables, Strings | | | Massreplace Categories : Filesystem, Regexps, Strings, PHP | | | Random Image Display Categories : PHP, Filesystem, Graphics, HTML and PHP | | | mysql_escape_string Categories : PHP, MySQL, Databases, Strings | | | PHP based Contact email form with multiple recipients, text file based, supports departments. Categories : PHP, Email, Beginner Guides, Filesystem | | | Allows you to parse a deliniated string and put the individual fields in a SELECT option in a form Categories : HTML, PHP, Strings | | | The toll booth Categories : PHP, Java Script, Filesystem | |
|
|
|