|
|
|
This function searches text or HTML files for URLs embedded in the text
and returns them in an array. CAUTION: If used as a web spider to collect
links on sites other than your own, use sleep()/usleep() to slow it down
and avoid overloading other servers! Development was done under WindowsNT,
but I can't see why it won't work under Unix.
<?php
# NOTES:
#
# Beware of setting the $depth parameter too high; this can cause lengthy
# searches and return very large result arrays! (Once again, CAUTION: If used
# as a web spider to collect links on sites other than your own, MODIFY THIS
# CLASS to use sleep()/usleep() to slow it down and avoid overloading other
# servers!)
#
# My main use for this function is to periodically scan web forum postings and
# summarize any URL mentions buried deep in the message threads. On Unix, this
# can be set up as a cron process; under WinNT, the command-line "AT" command
# or WinAT can be used for scheduling.
#
# Please excuse the formatting; I collapsed all tabs to single spaces, so some
# things aren't aligned nicely.
#
# If you have any suggestions/comments, email me: sbedberg@ucdavis.edu
#
# AND NOW, THE ACTUAL CODE:
?>
<?php
#------------------------------------------------------------------------------
# CLASS:
# link_harvester
# Scans text or HTML documents for URLs, and returns a list.
#------------------------------------------------------------------------------
# Notes:
# If the given filename is an actual file or a URL, it will scan that file;
# if it is a directory, it will look for and scan ALL files in that
# directory.
#
# If scanning a text file, will only recognize URLs with the "schema://..."
# form; newlines, tabs, whitespace and some punctuation (',",<,>) terminate
# it. Any links that are followed (depth > 0) will be followed as HTML, not
# as text files. The doc title array will contain the first line of the
# file, up to 80 chars. Lastly, all links are assumed to be non-local (too
# hard to determine otherwise).
#
# If scanning HTML, the routine only looks at HREFs. Anything without an
# explicit scheme is - as is specified by the W3 standard - assumed to be
# a reference to a local page.
#
# Public Methods:
# harvest($filename) Wrapper for _harvest()
# $filename This is the initial filename/URL to start scanning
#
# Private Methods:
# _harvest($filename, $depth, $as_text)
# Actual search routine
# $filename As above.
# $depth Depth of documents to search (see public vars, below)
# $as_text Determines how to search $filename (see public vars)
#
# _is_schema($string) Returns TRUE if a valid schema is found at the head
# of string (whitespace characters not permitted).
#
#------------------------------------------------------------------------------
# (version 0.9) written by S. Edberg (sbedberg@ucdavis.edu), March 1998.
#
# Change History:
#------------------------------------------------------------------------------
# TODO?
# BUGFIX: If (depth > 0 and local searching enabled), the base URL needs to
# be prepended to the $url argument to _harvest($url, $depth-1); currently,
# it returns a fail-to-open error.
# Have optional list of URLs to include or exclude
# Duplicate URL removal option
# Add timeout options (for entire script? for attempts to open URLs found?)
# Allow wildcards in file specification
# Optionally traverse subdirectories if a directory name is specified
#------------------------------------------------------------------------------
class link_harvester {
    # Scans a text/HTML file, a URL, or every regular file in a directory for
    # URLs and collects them (with a title and the source document) into the
    # parallel arrays $links / $titles / $source.
    #
    # Known limitations (unchanged from the original design):
    #  - HTML mode only recognises anchors written as <A HREF="...">, i.e. HREF
    #    must be the first and only attribute and must be double-quoted.
    #  - Links without a scheme are recorded verbatim; when $depth > 0 they are
    #    passed to fopen() unresolved and will normally count as a failure.
    #  - Text mode strips tags per chunk read, so a tag that is broken across
    #    two lines is not stripped.

    # Public variables: Input
    var $depth = 0;       # Link depth to search (0= $filename only, 1=1 level deep, ...)
    var $as_text = 1;     # If 1 (TRUE) scans docs as text; 0 (FALSE) - assumes HTML
    var $local_sw = 1;    # If TRUE, collect local links; ignore local otherwise
    var $remote_sw = 1;   # If TRUE, collect nonlocal link info; ignore otherwise
    # Public variables: Output
    var $attempts = 0;    # No. of files attempted
    var $failures = 0;    # No. of files unreachable
    var $lasterror = '';  # Last error message generated (excluding unreachable files)
    var $links = array();   # Array of links harvested
    var $titles = array();  # Array of titles ($titles[x] is for $links[x])
    var $source = array();  # Array of source URLs of links ($source[x] for $links[x])
    # Private variables:
    var $MAXLINELEN = 2048; # Maximum length of input line.

    # Public methods

    # harvest($filename)
    #   $filename  File name, URL ("scheme://...") or directory to scan.
    # Results are appended to $links/$titles/$source; problems with the
    # argument itself are reported through $lasterror. Returns nothing.
    function harvest($filename) {
        # Test for a scheme first: stat-ing a URL only produces a warning.
        if ($this->_is_schema($filename) || is_file($filename)) {
            $this->_harvest($filename, $this->depth, $this->as_text);
        } elseif (is_dir($filename)) {
            if ($dh = opendir($filename)) {
                # First, add a trailing slash to directory name if there is none
                $dname = $filename.(preg_match('#[\\\\/]\z#', $filename) ? "" : "/");
                # Compare against FALSE so an entry named "0" does not end the loop.
                while (($fname = readdir($dh)) !== false) {
                    if (is_file($dname.$fname)) {
                        $this->_harvest($dname.$fname, $this->depth, $this->as_text);
                    }
                }
                closedir($dh);
            } else {
                $this->lasterror = 'Unable to open directory: '.$filename;
            }
        } else {
            $this->lasterror = 'Unknown filetype for: '.$filename;
        }
    }

    # Private methods

    # _harvest($filename, $depth, $as_text)
    #   $filename  File name or URL to open.
    #   $depth     Remaining link depth; every link found is followed (as HTML)
    #              while this is > 0.
    #   $as_text   TRUE: look for "scheme://..." in tag-stripped text;
    #              FALSE: look for <A HREF="...">title</A>.
    # Increments $attempts, and $failures if the document cannot be opened.
    function _harvest($filename, $depth, $as_text) {
        $this->attempts++;
        # Best effort: an unreachable document is counted, not reported.
        $fp = @fopen($filename, "r");
        if (!$fp) {
            $this->failures++;
            return;
        }

        $in_url = 0;      # state variable: are we currently inside HREF="..."?
        $in_link = 0;     # state variable: are we in the linked text?
        $url = '';
        $title = '';
        $first_line = 1;
        $carry = '';      # unmatched tail of the previous read, re-scanned with the next one

        # Compare against FALSE so a line consisting of "0" does not end the scan.
        while (($line = fgets($fp, $this->MAXLINELEN)) !== false) {
            if ($as_text) {
                # Strip HTML/PHP tags (replacement for the removed fgetss()).
                $line = strip_tags($line);
                if ($first_line) { $title = substr($line, 0, 80); }
            }
            $first_line = 0;
            $rest = $carry.$line;
            $carry = '';

            while ($rest !== '') {
                if ($as_text) {
                    $parts = explode('://', $rest, 2);
                    if (count($parts) > 1) { # '://' found; now extract schema
                        $rest = $parts[1];
                        # The schema is the run of letters directly before '://'.
                        if (preg_match('#[[:alpha:]]+\z#', $parts[0], $m)) {
                            $pieces = preg_split('#[[:space:]\'"<>]#', $rest, 2);
                            $url = $m[0].'://'.$pieces[0];
                            # Nothing follows when the URL ends the chunk.
                            $rest = isset($pieces[1]) ? $pieces[1] : '';
                            $this->_add_link($url, "FROM: $title", $filename);
                            if ($depth > 0) { $this->_harvest($url, $depth-1, 0); } # recurse?
                        }
                    } else { # '://' not found; go to next line
                        # Only a trailing partial "scheme:/" can still become a
                        # URL once the next chunk arrives; keep just that.
                        preg_match('#[[:alpha:]]*:?/?\z#', $rest, $m);
                        $carry = $m[0];
                        $rest = '';
                    }
                } elseif (!$in_url) { # as HTML: look for start tag
                    $parts = preg_split(
                        '#<[[:space:]]*a[[:space:]]*href[[:space:]]*=[[:space:]]*"[[:space:]]*#i',
                        $rest,
                        2
                    );
                    if (count($parts) > 1) {
                        # Found; $parts[1] contains rest of line, starting at the URL
                        $in_url = 1;
                        $in_link = 0;
                        $url = '';
                        $title = '';
                        $rest = $parts[1];
                    } else { # not found; go on to next line
                        # A start tag broken across lines must begin at the
                        # last '<'; nothing before it can still match.
                        $lt = strrpos($rest, '<');
                        $carry = ($lt === false) ? '' : substr($rest, $lt);
                        $rest = '';
                    }
                } elseif (!$in_link) { # look for doublequote & '>' at end of HREF tag
                    $parts = preg_split('#[[:space:]]*"[[:space:]]*>[[:space:]]*#', $rest, 2);
                    if (count($parts) > 1) {
                        # The carried text is already part of $rest, so assign
                        # (not append) to avoid duplicating it.
                        $url = $parts[0];
                        $rest = $parts[1];
                        # Schema present => nonlocal reference, else local.
                        $is_schema = $this->_is_schema($url);
                        if (($is_schema && $this->remote_sw) || (!$is_schema && $this->local_sw)) {
                            $in_link = 1; # OK...continue with the title.
                            $title = '';
                        } else { # Ignore this; reset flags & continue.
                            $in_link = 0;
                            $in_url = 0;
                        }
                    } else { # end of URL not found on this line; go on to next
                        $carry = $rest;
                        $rest = '';
                    }
                } else { # $in_url AND $in_link: look for </A> tag after linked text
                    $parts = preg_split('#[[:space:]]*</a>[[:space:]]*#i', $rest, 2);
                    if (count($parts) > 1) { # found end tag
                        $title = $parts[0];
                        $rest = $parts[1];
                        $this->_add_link($url, $title, $filename);
                        $in_url = 0;
                        $in_link = 0;
                        if ($depth > 0) { $this->_harvest($url, $depth-1, 0); } # recurse?
                    } else { # end of title not found on this line; go on to next
                        $carry = $rest;
                        $rest = '';
                    }
                }
            } # end while ($rest !== '')
        }
        fclose($fp);

        # Document ended inside an anchor: record what was collected so far.
        if (!$as_text && $in_url) {
            if ($in_link) {
                $this->_add_link($url, $carry, $filename);
            } else {
                $this->_add_link($carry, '', $filename);
            }
        }
    }

    # _add_link($url, $title, $source)
    #   Appends one entry to the three parallel result arrays.
    function _add_link($url, $title, $source) {
        $this->links[] = $url;
        $this->titles[] = $title;
        $this->source[] = $source;
    }

    # _is_schema($string)
    #   Returns TRUE if $string starts with letters followed by "://"
    #   (leading whitespace is not permitted), FALSE otherwise.
    function _is_schema($string) {
        return preg_match('#^[[:alpha:]]+://#i', $string) === 1;
    }
}
?> |
|
| Takes an array and returns a string, suitable for inputing in an SQL statement
Categories : Arrays, Strings, PHP | | | columned txt file to array()? Categories : Arrays, Strings, Regexps, PHP | | | How to ifconfig down/up a list of IP's Categories : Arrays, Strings, Filesystem, PHP | | | Grab images from one or more URLs and save them to a specified local directory. Categories : PHP, Filesystem, Strings, Arrays | | | Variable serialization and unserialization. Loading and saving variable structures
to and from file. Categories : Arrays, Filesystem, Variables, Strings, PHP | | | Compare two texts and display a block of text with the differences between them. Categories : PHP, PHP Classes, Filesystem, Strings, Arrays | | | Function to create a separated list Categories : PHP, Arrays, Strings | | | WWW interface to Unix Manual(phpMan)
Categories : Program Execution, Strings, Arrays, PHP | | | Get TemplateMonster data Categories : Arrays, Ecommerce, PHP, Strings | | | How to Get a character array from a string Categories : PHP, Strings, Arrays | | | Can the word DO be used in arrays? Categories : Arrays, PHP, Strings | | | clearing variables in php3 Categories : Variables, Arrays, PHP | | | Avoiding or Detecting high bit characters in a string. Useful when you want to create a valid RSS feed Categories : PHP, Strings, Unicode, Regexps, Rich Site Summary (RSS) | | | Select with current month Categories : PHP, HTML and PHP, Date Time, Arrays | | | LDAP Categories : Initials, General | |
| | | | Robert Breker wrote :132
Newer version available?
| |
|
|
|