|
|
|
#!/usr/local/bin/php -q
<?php
#PHP implementation of the Porter Stemming Algorithm
#Written by Iain Argent for Complinet Ltd., 17/2/00
#Translated from the PERL version at http://www.muscat.com/~martin/p.txt
#Version 1.1 (Includes British English endings)
#--Reduces words to their base stem for search engines and indexing
function stem($word) {
$step2list=array(
'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize',
'iser'=>'ise', 'bli'=>'ble',
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize',
'isation'=>'ise', 'ation'=>'ate',
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous',
'aliti'=>'al',
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
);
$step3list=array(
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'alise'=>'al', 'iciti'=>'ic', 'ical'=>'ic',
'ful'=>'', 'ness'=>''
);
$c = "[^aeiou]"; # consonant
$v = "[aeiouy]"; # vowel
$C = "${c}[^aeiouy]*"; # consonant sequence
$V = "${v}[aeiou]*"; # vowel sequence
$mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0
$meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1
$mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1
$_v = "^(${C})?${v}"; # vowel in stem
if (strlen($word)<3) return $word;
$word=preg_replace("/^y/", "Y", $word);
#Step 1a
$word=preg_replace("/(ss|i)es$/", "\\1", $word); # sses-> ss, ies->es
$word=preg_replace("/([^s])s$/", "\\1", $word); # ss->ss but s->null
#Step 1b
if (preg_match("/eed$/", $word)) {
$stem=preg_replace("/eed$/", "", $word);
if (ereg("$mgr0", $stem)) {
$word=preg_replace("/.$/", "", $word);
}
}
elseif (preg_match("/(ed|ing)$/", $word)) {
$stem=preg_replace("/(ed|ing)$/", "", $word);
if (preg_match("/$_v/", $stem)) {
$word=$stem;
if (preg_match("/(at|bl|iz|is)$/", $word)) {
$word=preg_replace("/(at|bl|iz|is)$/", "\\1e", $word);
}
elseif (preg_match("/([^aeiouylsz])\\1$/", $word)) {
$word=preg_replace("/.$/", "", $word);
}
elseif (preg_match("/^${C}${v}[^aeiouwxy]$/", $word)) {
$word.="e";
}
}
}
#Step 1c (weird rule)
if (preg_match("/y$/", $word)) {
$stem=preg_replace("/y$/", "", $word);
if (preg_match("/$_v/", $stem))
$word=$stem."i";
}
#Step 2
if
(preg_match("/(ational|tional|enci|anci|izer|iser|bli|alli|entli|eli|ousli|ization|isation|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/",
$word, $matches)) {
$stem=preg_replace("/
(ational|tional|enci|anci|izer|iser|bli|alli|entli|eli|ousli|ization|isation|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/",
"", $word);
$suffix=$matches[1];
if (preg_match("/$mgr0/", $stem)) {
$word=$stem.$step2list[$suffix];
}
}
#Step 3
if (preg_match("/(icate|ative|alize|alise|iciti|ical|ful|ness)$/", $word, $matches)) {
$stem=preg_replace("/(icate|ative|alize|alise|iciti|ical|ful|ness)$/", "", $word);
$suffix=$matches[1];
if (preg_match("/$mgr0/", $stem)) {
$word=$stem.$step3list[$suffix];
}
}
#Step 4
if
(preg_match("/(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize|ise)$/",
$word, $matches)) {
$stem=preg_replace("/(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize|ise)$/",
"", $word);
$suffix=$matches[1];
if (preg_match("/$mgr1/", $stem)) {
$word=$stem;
}
}
elseif (preg_match("/(s|t)ion$/", $word)) {
$stem=preg_replace("/(s|t)ion$/", "\\1", $word);
if (preg_match("/$mgr1/", $stem)) $word=$stem;
}
#Step 5
if (preg_match("/e$/", $word, $matches)) {
$stem=preg_replace("/e$/", "", $word);
if (preg_match("/$mgr1/", $stem) |
(preg_match("/$meq1/", $stem) &
~preg_match("/^${C}${v}[^aeiouwxy]$/", $stem))) {
$word=$stem;
}
}
if (preg_match("/ll$/", $word) & preg_match("/$mgr1/", $word)) $word=preg_replace("/.$/", "",
$word);
# and turn initial Y back to y
preg_replace("/^Y/", "y", $word);
return $word;
}
######Example
echo("demutualisation --> ".stem("demutualisation");
?>
|
|
| textwrap fill-paragraph (justification) Categories : Strings, PHP, Algorithms | | | How to control the number of decimal places when outputting numbers. Categories : PHP, Strings, Variables | | | Produces browser-safe strings while preserving HTML tags. Categories : Strings, HTTP, PHP, HTML and PHP | | | How to Generate a random 8 character string in php3? Categories : PHP, Algorithms | | | Making sure a string containes only digits or no digits. Categories : Strings, PHP, Regexps | | | Filter - A simple class that lets you use multiple functions to create custom filters. Categories : PHP, PHP Classes, Strings | | | Function to convert Arabic numbers into Roman Numerals Categories : Algorithms, PHP, Date Time | | | Library of math functions to expand the functionality of PHP3. Version 1.2.1 fixes a major problem with the gcd function.
Categories : Algorithms, PHP, Math. | | | String Replacement and speed consideration
Categories : PHP, Strings, Regexps | | | Adding dashes to credit card numbers Categories : Strings, Credit Cards, PHP | | | Credit Card Identification and Validation Class - The credit_card class provides methods for cleaning, validating and identifying the type of credit card numbers. Categories : PHP, PHP Classes, Credit Cards, Ecommerce, Algorithms | | | Phorum, MySQL, Language, UK date format, MySQL UK Date format Categories : PHP, Date Time, Strings, MySQL, Databases | | | How to ifconfig down/up a list of IP's Categories : Arrays, Strings, Filesystem, PHP | | | How to avoid the slashes (\) when retriving information from
sql when MagicQuotes are on? Categories : Strings, PHP | | | Variable serialization and unserialization. Loading and saving variable structures
to and from file. Categories : Arrays, Filesystem, Variables, Strings, PHP | |
| | | | James Clarke wrote :677
This is an excellent script and has helped me enormously with a db keyword search by allowing a wildcard search on the stem of a word.
However, one small (and pedantic) point, in the example at the end, the last line should read:
either:
echo ("demutualisation --> ".stem("demutualisation"));
or:
echo "demutualisation --> ".stem("demutualisation");
otherwise you get a parse error.
Just a small thing but otherwise a really useful script.
| |
|
|