| /* |
| * Copyright 2001-2004 The Apache Software Foundation. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.commons.codec.language; |
| |
| import org.apache.commons.codec.EncoderException; |
| import org.apache.commons.codec.StringEncoder; |
| |
| /** |
| * Encodes a string into a metaphone value. |
| * <p> |
| * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. |
| * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. |
| * </p> |
| * <p> |
| * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p |
| * 39.</CITE> |
| * </p> |
| * |
| * @author Apache Software Foundation |
| * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $ |
| */ |
| public class Metaphone implements StringEncoder { |
| |
| /** |
| * Five values in the English language |
| */ |
| private String vowels = "AEIOU" ; |
| |
| /** |
| * Variable used in Metaphone algorithm |
| */ |
| private String frontv = "EIY" ; |
| |
| /** |
| * Variable used in Metaphone algorithm |
| */ |
| private String varson = "CSPTG" ; |
| |
| /** |
| * The max code length for metaphone is 4 |
| */ |
| private int maxCodeLen = 4 ; |
| |
| /** |
| * Creates an instance of the Metaphone encoder |
| */ |
| public Metaphone() { |
| super(); |
| } |
| |
| /** |
| * Find the metaphone value of a String. This is similar to the |
| * soundex algorithm, but better at finding similar sounding words. |
| * All input is converted to upper case. |
| * Limitations: Input format is expected to be a single ASCII word |
| * with only characters in the A - Z range, no punctuation or numbers. |
| * |
| * @param txt String to find the metaphone code for |
| * @return A metaphone code corresponding to the String supplied |
| */ |
| public String metaphone(String txt) { |
| boolean hard = false ; |
| if ((txt == null) || (txt.length() == 0)) { |
| return "" ; |
| } |
| // single character is itself |
| if (txt.length() == 1) { |
| return txt.toUpperCase() ; |
| } |
| |
| char[] inwd = txt.toUpperCase().toCharArray() ; |
| |
| StringBuffer local = new StringBuffer(40); // manipulate |
| StringBuffer code = new StringBuffer(10) ; // output |
| // handle initial 2 characters exceptions |
| switch(inwd[0]) { |
| case 'K' : |
| case 'G' : |
| case 'P' : /* looking for KN, etc*/ |
| if (inwd[1] == 'N') { |
| local.append(inwd, 1, inwd.length - 1); |
| } else { |
| local.append(inwd); |
| } |
| break; |
| case 'A': /* looking for AE */ |
| if (inwd[1] == 'E') { |
| local.append(inwd, 1, inwd.length - 1); |
| } else { |
| local.append(inwd); |
| } |
| break; |
| case 'W' : /* looking for WR or WH */ |
| if (inwd[1] == 'R') { // WR -> R |
| local.append(inwd, 1, inwd.length - 1); |
| break ; |
| } |
| if (inwd[1] == 'H') { |
| local.append(inwd, 1, inwd.length - 1); |
| local.setCharAt(0, 'W'); // WH -> W |
| } else { |
| local.append(inwd); |
| } |
| break; |
| case 'X' : /* initial X becomes S */ |
| inwd[0] = 'S'; |
| local.append(inwd); |
| break ; |
| default : |
| local.append(inwd); |
| } // now local has working string with initials fixed |
| |
| int wdsz = local.length(); |
| int n = 0 ; |
| |
| while ((code.length() < this.getMaxCodeLen()) && |
| (n < wdsz) ) { // max code size of 4 works well |
| char symb = local.charAt(n) ; |
| // remove duplicate letters except C |
| if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { |
| n++ ; |
| } else { // not dup |
| switch(symb) { |
| case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : |
| if (n == 0) { |
| code.append(symb); |
| } |
| break ; // only use vowel if leading char |
| case 'B' : |
| if ( isPreviousChar(local, n, 'M') && |
| isLastChar(wdsz, n) ) { // B is silent if word ends in MB |
| break; |
| } |
| code.append(symb); |
| break; |
| case 'C' : // lots of C special cases |
| /* discard if SCI, SCE or SCY */ |
| if ( isPreviousChar(local, n, 'S') && |
| !isLastChar(wdsz, n) && |
| (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) { |
| break; |
| } |
| if (regionMatch(local, n, "CIA")) { // "CIA" -> X |
| code.append('X'); |
| break; |
| } |
| if (!isLastChar(wdsz, n) && |
| (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) { |
| code.append('S'); |
| break; // CI,CE,CY -> S |
| } |
| if (isPreviousChar(local, n, 'S') && |
| isNextChar(local, n, 'H') ) { // SCH->sk |
| code.append('K') ; |
| break ; |
| } |
| if (isNextChar(local, n, 'H')) { // detect CH |
| if ((n == 0) && |
| (wdsz >= 3) && |
| isVowel(local,2) ) { // CH consonant -> K consonant |
| code.append('K'); |
| } else { |
| code.append('X'); // CHvowel -> X |
| } |
| } else { |
| code.append('K'); |
| } |
| break ; |
| case 'D' : |
| if (!isLastChar(wdsz, n + 1) && |
| isNextChar(local, n, 'G') && |
| (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J |
| code.append('J'); n += 2 ; |
| } else { |
| code.append('T'); |
| } |
| break ; |
| case 'G' : // GH silent at end or before consonant |
| if (isLastChar(wdsz, n + 1) && |
| isNextChar(local, n, 'H')) { |
| break; |
| } |
| if (!isLastChar(wdsz, n + 1) && |
| isNextChar(local,n,'H') && |
| !isVowel(local,n+2)) { |
| break; |
| } |
| if ((n > 0) && |
| ( regionMatch(local, n, "GN") || |
| regionMatch(local, n, "GNED") ) ) { |
| break; // silent G |
| } |
| if (isPreviousChar(local, n, 'G')) { |
| hard = true ; |
| } else { |
| hard = false ; |
| } |
| if (!isLastChar(wdsz, n) && |
| (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && |
| (!hard)) { |
| code.append('J'); |
| } else { |
| code.append('K'); |
| } |
| break ; |
| case 'H': |
| if (isLastChar(wdsz, n)) { |
| break ; // terminal H |
| } |
| if ((n > 0) && |
| (this.varson.indexOf(local.charAt(n - 1)) >= 0)) { |
| break; |
| } |
| if (isVowel(local,n+1)) { |
| code.append('H'); // Hvowel |
| } |
| break; |
| case 'F': |
| case 'J' : |
| case 'L' : |
| case 'M': |
| case 'N' : |
| case 'R' : |
| code.append(symb); |
| break; |
| case 'K' : |
| if (n > 0) { // not initial |
| if (!isPreviousChar(local, n, 'C')) { |
| code.append(symb); |
| } |
| } else { |
| code.append(symb); // initial K |
| } |
| break ; |
| case 'P' : |
| if (isNextChar(local,n,'H')) { |
| // PH -> F |
| code.append('F'); |
| } else { |
| code.append(symb); |
| } |
| break ; |
| case 'Q' : |
| code.append('K'); |
| break; |
| case 'S' : |
| if (regionMatch(local,n,"SH") || |
| regionMatch(local,n,"SIO") || |
| regionMatch(local,n,"SIA")) { |
| code.append('X'); |
| } else { |
| code.append('S'); |
| } |
| break; |
| case 'T' : |
| if (regionMatch(local,n,"TIA") || |
| regionMatch(local,n,"TIO")) { |
| code.append('X'); |
| break; |
| } |
| if (regionMatch(local,n,"TCH")) { |
| // Silent if in "TCH" |
| break; |
| } |
| // substitute numeral 0 for TH (resembles theta after all) |
| if (regionMatch(local,n,"TH")) { |
| code.append('0'); |
| } else { |
| code.append('T'); |
| } |
| break ; |
| case 'V' : |
| code.append('F'); break ; |
| case 'W' : case 'Y' : // silent if not followed by vowel |
| if (!isLastChar(wdsz,n) && |
| isVowel(local,n+1)) { |
| code.append(symb); |
| } |
| break ; |
| case 'X' : |
| code.append('K'); code.append('S'); |
| break ; |
| case 'Z' : |
| code.append('S'); break ; |
| } // end switch |
| n++ ; |
| } // end else from symb != 'C' |
| if (code.length() > this.getMaxCodeLen()) { |
| code.setLength(this.getMaxCodeLen()); |
| } |
| } |
| return code.toString(); |
| } |
| |
| private boolean isVowel(StringBuffer string, int index) { |
| return (this.vowels.indexOf(string.charAt(index)) >= 0); |
| } |
| |
| private boolean isPreviousChar(StringBuffer string, int index, char c) { |
| boolean matches = false; |
| if( index > 0 && |
| index < string.length() ) { |
| matches = string.charAt(index - 1) == c; |
| } |
| return matches; |
| } |
| |
| private boolean isNextChar(StringBuffer string, int index, char c) { |
| boolean matches = false; |
| if( index >= 0 && |
| index < string.length() - 1 ) { |
| matches = string.charAt(index + 1) == c; |
| } |
| return matches; |
| } |
| |
| private boolean regionMatch(StringBuffer string, int index, String test) { |
| boolean matches = false; |
| if( index >= 0 && |
| (index + test.length() - 1) < string.length() ) { |
| String substring = string.substring( index, index + test.length()); |
| matches = substring.equals( test ); |
| } |
| return matches; |
| } |
| |
| private boolean isLastChar(int wdsz, int n) { |
| return n + 1 == wdsz; |
| } |
| |
| |
| /** |
| * Encodes an Object using the metaphone algorithm. This method |
| * is provided in order to satisfy the requirements of the |
| * Encoder interface, and will throw an EncoderException if the |
| * supplied object is not of type java.lang.String. |
| * |
| * @param pObject Object to encode |
| * @return An object (or type java.lang.String) containing the |
| * metaphone code which corresponds to the String supplied. |
| * @throws EncoderException if the parameter supplied is not |
| * of type java.lang.String |
| */ |
| public Object encode(Object pObject) throws EncoderException { |
| if (!(pObject instanceof java.lang.String)) { |
| throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); |
| } |
| return metaphone((String) pObject); |
| } |
| |
| /** |
| * Encodes a String using the Metaphone algorithm. |
| * |
| * @param pString String object to encode |
| * @return The metaphone code corresponding to the String supplied |
| */ |
| public String encode(String pString) { |
| return metaphone(pString); |
| } |
| |
| /** |
| * Tests is the metaphones of two strings are identical. |
| * |
| * @param str1 First of two strings to compare |
| * @param str2 Second of two strings to compare |
| * @return true if the metaphones of these strings are identical, |
| * false otherwise. |
| */ |
| public boolean isMetaphoneEqual(String str1, String str2) { |
| return metaphone(str1).equals(metaphone(str2)); |
| } |
| |
| /** |
| * Returns the maxCodeLen. |
| * @return int |
| */ |
| public int getMaxCodeLen() { return this.maxCodeLen; } |
| |
| /** |
| * Sets the maxCodeLen. |
| * @param maxCodeLen The maxCodeLen to set |
| */ |
| public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } |
| |
| } |