|
|
|
#!/usr/local/bin/php -q
<?php
#PHP implementation of the Porter Stemming Algorithm
#Written by Iain Argent for Complinet Ltd., 17/2/00
#Translated from the PERL version at http://www.muscat.com/~martin/p.txt
#Version 1.1 (Includes British English endings)
#--Reduces words to their base stem for search engines and indexing
# Reduce an English word to its stem with the Porter stemming algorithm,
# extended with British English endings (-iser, -isation, -alise, -ise).
#
# The patterns only recognise lower-case letters, so callers should pass a
# lower-cased word. Words shorter than three characters are returned unchanged.
#
# @param string $word  Word to stem.
# @return string       The stemmed word.
function stem($word) {
    # Step 2 suffixes and their replacements (applied when the stem has m > 0).
    $step2list = array(
        'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
        'izer' => 'ize', 'iser' => 'ise', 'bli' => 'ble', 'alli' => 'al',
        'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous', 'ization' => 'ize',
        'isation' => 'ise', 'ation' => 'ate', 'ator' => 'ate', 'alism' => 'al',
        'iveness' => 'ive', 'fulness' => 'ful', 'ousness' => 'ous', 'aliti' => 'al',
        'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
    );
    # Step 3 suffixes and their replacements (applied when the stem has m > 0).
    $step3list = array(
        'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'alise' => 'al',
        'iciti' => 'ic', 'ical' => 'ic', 'ful' => '', 'ness' => ''
    );
    # Step 4 suffixes, simply removed when the stem has m > 1.
    $step4suffixes = 'al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize|ise';

    $c = '[^aeiou]';          # consonant
    $v = '[aeiouy]';          # vowel
    $C = "{$c}[^aeiouy]*";    # consonant sequence
    $V = "{$v}[aeiou]*";      # vowel sequence

    $mgr0 = "/^({$C})?{$V}{$C}/";                 # [C]VC... is m>0
    $meq1 = "/^({$C})?{$V}{$C}({$V})?" . '$/';    # [C]VC[V] is m=1
    $mgr1 = "/^({$C})?{$V}{$C}{$V}{$C}/";         # [C]VCVC... is m>1
    $hasVowel = "/^({$C})?{$v}/";                 # vowel in stem
    $cvc = "/^{$C}{$v}[^aeiouwxy]" . '$/';        # consonant-vowel-consonant ending

    if (strlen($word) < 3) {
        return $word;
    }

    # Map an initial y to Y so the patterns never treat it as a vowel.
    # Remember that we did so, so that only a y we changed is restored later.
    $startsWithY = substr($word, 0, 1) === 'y';
    if ($startsWithY) {
        $word = 'Y' . substr($word, 1);
    }

    # Step 1a
    $word = preg_replace('/(ss|i)es$/', '\\1', $word);   # sses -> ss, ies -> i
    $word = preg_replace('/([^s])s$/', '\\1', $word);    # ss -> ss but s -> null

    # Step 1b
    if (preg_match('/eed$/', $word)) {
        # eed -> ee only when the part before "eed" has m > 0
        if (preg_match($mgr0, substr($word, 0, -3))) {
            $word = substr($word, 0, -1);
        }
    } elseif (preg_match('/(ed|ing)$/', $word, $matches)) {
        $stem = substr($word, 0, -strlen($matches[1]));
        if (preg_match($hasVowel, $stem)) {
            $word = $stem;
            if (preg_match('/(at|bl|iz|is)$/', $word)) {
                $word .= 'e';
            } elseif (preg_match('/([^aeiouylsz])\\1$/', $word)) {
                # doubled consonant: drop one of the pair
                $word = substr($word, 0, -1);
            } elseif (preg_match($cvc, $word)) {
                $word .= 'e';
            }
        }
    }

    # Step 1c (weird rule): y -> i when the stem contains a vowel
    if (preg_match('/y$/', $word)) {
        $stem = substr($word, 0, -1);
        if (preg_match($hasVowel, $stem)) {
            $word = $stem . 'i';
        }
    }

    # Step 2
    # The pattern is built from the table keys so the two cannot drift apart.
    # Anchoring at the end makes the leftmost match the longest suffix.
    $step2pattern = '/(' . implode('|', array_keys($step2list)) . ')$/';
    if (preg_match($step2pattern, $word, $matches)) {
        $suffix = $matches[1];
        $stem = substr($word, 0, -strlen($suffix));
        if (preg_match($mgr0, $stem)) {
            $word = $stem . $step2list[$suffix];
        }
    }

    # Step 3
    $step3pattern = '/(' . implode('|', array_keys($step3list)) . ')$/';
    if (preg_match($step3pattern, $word, $matches)) {
        $suffix = $matches[1];
        $stem = substr($word, 0, -strlen($suffix));
        if (preg_match($mgr0, $stem)) {
            $word = $stem . $step3list[$suffix];
        }
    }

    # Step 4
    if (preg_match('/(' . $step4suffixes . ')$/', $word, $matches)) {
        $stem = substr($word, 0, -strlen($matches[1]));
        if (preg_match($mgr1, $stem)) {
            $word = $stem;
        }
    } elseif (preg_match('/(s|t)ion$/', $word)) {
        # drop "ion" but keep the preceding s or t
        $stem = substr($word, 0, -3);
        if (preg_match($mgr1, $stem)) {
            $word = $stem;
        }
    }

    # Step 5
    if (preg_match('/e$/', $word)) {
        $stem = substr($word, 0, -1);
        if (preg_match($mgr1, $stem)
            || (preg_match($meq1, $stem) && !preg_match($cvc, $stem))) {
            $word = $stem;
        }
    }
    if (preg_match('/ll$/', $word) && preg_match($mgr1, $word)) {
        $word = substr($word, 0, -1);
    }

    # and turn initial Y back to y
    if ($startsWithY) {
        $word = 'y' . substr($word, 1);
    }
    return $word;
}
######Example
# Prints a sample word followed by its stem. The original echo( call was
# missing its closing parenthesis, which made the whole file a parse error.
echo "demutualisation --> " . stem("demutualisation");
?>
|
|
| textwrap fill-paragraph (justification) Categories : Strings, PHP, Algorithms | | | Dollar Serial Number Validator Categories : PHP, Security, Algorithms | | | Avoiding or Detecting high bit characters in a string. Useful when you want to create a valid RSS feed Categories : PHP, Strings, Unicode, Regexps, Rich Site Summary (RSS) | | | A very simple and efficient split bar the B-Z bar , for mysql and php ...
Tired of obfuscated code try this one ...
Categories : PHP, Databases, MySQL, Algorithms | | | Allows you to parse a delimited string and put the individual fields in a SELECT option in a form Categories : HTML, PHP, Strings | | | Using data from a string. Categories : PHP, Strings, CURL | | | IPhider Obscure Any URL Anonymity connection lores obfuscation corporate survival. Categories : PHP, Algorithms, Security, URLs | | | function textwrap will wrap text to any desired width using <BR>\n as the default line break.
Default wrap width is 80 columns.
Categories : Strings, HTML and PHP, PHP | | | PHP Function to Encrypt/Decrypt a string without a known key. The string itself has his own different key for every character. Categories : PHP, Algorithms, Security, Authentication, Encryption | | | Adding dashes to credit card numbers Categories : Strings, Credit Cards, PHP | | | Credit Card Identification and Validation Class - The credit_card class provides methods for cleaning, validating and identifying the type of credit card numbers. Categories : PHP, PHP Classes, Credit Cards, Ecommerce, Algorithms | | | I need a trim function/regexp that will trim all " " from the ends of a string. Categories : Regexps, PHP, Strings | | | Check parameters validity. Paranoia was designed to check the validity of the parameters that a php page will receive after a form submission. It can be used to check the variables sent by POST or GET Categories : Algorithms, HTML and PHP, PHP, Variables | | | A very simple way to build and do a hierarchical html categories browser without javascript , just using html php and mySql
Categories : HTML and PHP, Databases, Algorithms, PHP, MySQL | | | mysql_escape_string Categories : PHP, MySQL, Databases, Strings | |
| | | | James Clarke wrote :677
This is an excellent script and has helped me enormously with a db keyword search by allowing a wildcard search on the stem of a word.
However, one small (and pedantic) point, in the example at the end, the last line should read:
either:
echo ("demutualisation --> ".stem("demutualisation"));
or:
echo "demutualisation --> ".stem("demutualisation");
otherwise you get a parse error.
Just a small thing but otherwise a really useful script.
| |
|
|
|