001/*
002 * Copyright © 2012, 2013, 2014 Royal Botanic Gardens, Kew.
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
005 *
006 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
007 *
008 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
009 */
010package org.kew.rmf.transformers.botany;
011
012import java.util.regex.Pattern;
013
014import org.kew.rmf.transformers.StripNonAsciiAlphabeticCharactersTransformer;
015import org.kew.rmf.transformers.Transformer;
016
017/**
018 * This transformer canonicalises epithets.
019 */
020public class EpithetTransformer implements Transformer {
021
022        private StripNonAsciiAlphabeticCharactersTransformer stripNonAsciiAlphabetic;
023
024        private Pattern ana = Pattern.compile("(?<=[^i])ana$");
025        private Pattern aef = Pattern.compile("aef");
026        private Pattern colus = Pattern.compile("colus$");
027        private Pattern us = Pattern.compile("us$");
028        private Pattern um = Pattern.compile("um$");
029        private Pattern on = Pattern.compile("on$");
030        private Pattern iae = Pattern.compile("iae$");
031        private Pattern ei = Pattern.compile("ei$");
032        private Pattern ae = Pattern.compile("ae$");
033        private Pattern ii = Pattern.compile("ii$");
034        private Pattern ioi = Pattern.compile("ioi$");
035
036        public EpithetTransformer() {
037                stripNonAsciiAlphabetic = new StripNonAsciiAlphabeticCharactersTransformer();
038                stripNonAsciiAlphabetic.setReplacement("");
039        }
040
041        @Override
042        public String transform(String s) {
043                if (s == null) return null;
044                s = stripNonAsciiAlphabetic.transform(s);
045
046                s = ana.matcher(s).replaceFirst("iana");
047                s = aef.matcher(s).replaceFirst("if");
048                s = colus.matcher(s).replaceFirst("cola");
049                s = us.matcher(s).replaceFirst("a");
050                s = um.matcher(s).replaceFirst("a");
051                s = on.matcher(s).replaceFirst("a");
052                s = iae.matcher(s).replaceFirst("i");
053                s = ei.matcher(s).replaceFirst("i");
054                s = ae.matcher(s).replaceFirst("i");
055                s = ii.matcher(s).replaceFirst("i");
056                s = ioi.matcher(s).replaceFirst("oi");
057                s = s.replace("j", "i");
058                s = s.replace("y", "i");
059                s = s.replace("-", "");
060                s = s.replace("'", "");
061
062                /*
063                if (value.endsWith("anus") && !value.endsWith("ianus"))
064                        value = value.replaceFirst("anus$", "ianus");
065                if (value.endsWith("anum") && !value.endsWith("ianum"))
066                        value = value.replaceFirst("anum$", "ianum");
067                if (value.endsWith("arum") && !value.endsWith("iarum"))
068                        value = value.replaceFirst("arum$", "iarum");
069                if (value.endsWith("orum") && !value.endsWith("iorum"))
070                        value = value.replaceFirst("orum$", "iorum");
071                */
072
073                return s;
074        }
075}