001// License: GPL. For details, see LICENSE file. 002package org.openstreetmap.josm.data.validation.tests; 003 004import static java.util.regex.Pattern.CASE_INSENSITIVE; 005import static java.util.regex.Pattern.UNICODE_CASE; 006import static org.openstreetmap.josm.tools.I18n.tr; 007 008import java.awt.geom.Point2D; 009import java.util.ArrayList; 010import java.util.Arrays; 011import java.util.HashMap; 012import java.util.List; 013import java.util.Locale; 014import java.util.Map; 015import java.util.regex.Matcher; 016import java.util.regex.Pattern; 017 018import org.openstreetmap.josm.data.osm.OsmPrimitive; 019import org.openstreetmap.josm.data.osm.Way; 020import org.openstreetmap.josm.data.validation.Severity; 021import org.openstreetmap.josm.data.validation.Test; 022import org.openstreetmap.josm.data.validation.TestError; 023import org.openstreetmap.josm.data.validation.util.ValUtil; 024import org.openstreetmap.josm.gui.progress.ProgressMonitor; 025import org.openstreetmap.josm.tools.MultiMap; 026import org.openstreetmap.josm.tools.Utils; 027 028/** 029 * Checks for similar named ways, symptom of a possible typo. It uses the 030 * Levenshtein distance to check for similarity 031 * 032 * @author frsantos 033 */ 034public class SimilarNamedWays extends Test { 035 036 protected static final int SIMILAR_NAMED = 701; 037 038 /** All ways, grouped by cells */ 039 private Map<Point2D, List<Way>> cellWays; 040 /** The already detected errors */ 041 private MultiMap<Way, Way> errorWays; 042 043 private final List<NormalizeRule> rules = new ArrayList<>(); 044 045 /** 046 * Constructor 047 */ 048 public SimilarNamedWays() { 049 super(tr("Similarly named ways"), 050 tr("This test checks for ways with similar names that may have been misspelled.")); 051 052 // FIXME: hardcode these rules for now. Replace them with preferences later 053 // See https://josm.openstreetmap.de/ticket/3733#comment:19 054 addRegExprRule("\\d+", "0"); // Highway 66 055 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave 056 addRegExprRule("^[A-Z] ", "X"); // E Street 057 addSynonyms("east", "west", "north", "south"); 058 addSynonyms("first", "second", "third"); 059 } 060 061 @Override 062 public void startTest(ProgressMonitor monitor) { 063 super.startTest(monitor); 064 cellWays = new HashMap<>(1000); 065 errorWays = new MultiMap<>(); 066 } 067 068 @Override 069 public void endTest() { 070 cellWays = null; 071 errorWays = null; 072 super.endTest(); 073 } 074 075 @Override 076 public void visit(Way w) { 077 if (!w.isUsable()) 078 return; 079 080 String name = w.get("name"); 081 if (name == null || name.length() < 6) 082 return; 083 084 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays); 085 for (List<Way> ways : theCellWays) { 086 for (Way w2 : ways) { 087 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) { 088 continue; 089 } 090 091 String name2 = w2.get("name"); 092 if (name2 == null || name2.length() < 6) { 093 continue; 094 } 095 096 if (similaryName(name, name2)) { 097 List<OsmPrimitive> primitives = new ArrayList<>(2); 098 primitives.add(w); 099 primitives.add(w2); 100 errors.add(new TestError(this, Severity.WARNING, tr("Similarly named ways"), SIMILAR_NAMED, primitives)); 101 errorWays.put(w, w2); 102 } 103 } 104 ways.add(w); 105 } 106 } 107 108 /** 109 * Compute Levenshtein distance 110 * 111 * @param s First word 112 * @param t Second word 113 * @return The distance between words 114 */ 115 public static int getLevenshteinDistance(String s, String t) { 116 int[][] d; // matrix 117 int n; // length of s 118 int m; // length of t 119 int i; // iterates through s 120 int j; // iterates through t 121 char si; // ith character of s 122 char tj; // jth character of t 123 int cost; // cost 124 125 // Step 1 126 n = s.length(); 127 m = t.length(); 128 if (n == 0) 129 return m; 130 if (m == 0) 131 return n; 132 d = new int[n+1][m+1]; 133 134 // Step 2 135 for (i = 0; i <= n; i++) { 136 d[i][0] = i; 137 } 138 for (j = 0; j <= m; j++) { 139 d[0][j] = j; 140 } 141 142 // Step 3 143 for (i = 1; i <= n; i++) { 144 145 si = s.charAt(i - 1); 146 147 // Step 4 148 for (j = 1; j <= m; j++) { 149 150 tj = t.charAt(j - 1); 151 152 // Step 5 153 if (si == tj) { 154 cost = 0; 155 } else { 156 cost = 1; 157 } 158 159 // Step 6 160 d[i][j] = Utils.min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); 161 } 162 } 163 164 // Step 7 165 return d[n][m]; 166 } 167 168 /** 169 * Add a regular expression rule. 170 * @param regExpr the regular expression to search for 171 * @param replacement a string to replace with, which should match the expression. 172 */ 173 public void addRegExprRule(String regExpr, String replacement) { 174 rules.add(new RegExprRule(regExpr, replacement)); 175 } 176 177 /** 178 * Add a rule with synonym words. 179 * @param words words which are synonyms 180 */ 181 public void addSynonyms(String... words) { 182 for (String word : words) { 183 rules.add(new SynonymRule(word, words)); 184 } 185 } 186 187 /** 188 * Check if two names are similar, but not identical. First both names will be "normalized". 189 * Afterwards the Levenshtein distance will be calculated.<br> 190 * Examples for normalization rules:<br> 191 * <code>replaceAll("\\d+", "0")</code><br> 192 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true 193 * @param name first name to compare 194 * @param name2 second name to compare 195 * @return true if the normalized names are different but only a "little bit" 196 */ 197 public boolean similaryName(String name, String name2) { 198 // check plain strings 199 int distance = getLevenshteinDistance(name, name2); 200 boolean similar = distance > 0 && distance <= 2; 201 202 // try all rules 203 for (NormalizeRule rule : rules) { 204 int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); 205 if (levenshteinDistance == 0) 206 // one rule results in identical names: identical 207 return false; 208 else if (levenshteinDistance <= 2) { 209 // 0 < distance <= 2 210 similar = true; 211 } 212 } 213 return similar; 214 } 215 216 public interface NormalizeRule { 217 218 /** 219 * Normalize the string by replacing parts. 220 * @param name name to normalize 221 * @return normalized string 222 */ 223 String normalize(String name); 224 } 225 226 public static class RegExprRule implements NormalizeRule { 227 private final Pattern regExpr; 228 private final String replacement; 229 230 public RegExprRule(String expression, String replacement) { 231 this.regExpr = Pattern.compile(expression); 232 this.replacement = replacement; 233 } 234 235 @Override 236 public String normalize(String name) { 237 return regExpr.matcher(name).replaceAll(replacement); 238 } 239 240 @Override 241 public String toString() { 242 return "replaceAll(" + regExpr + ", " + replacement + ')'; 243 } 244 } 245 246 public static class SynonymRule implements NormalizeRule { 247 248 private final String[] words; 249 private final Pattern regExpr; 250 private final String replacement; 251 252 public SynonymRule(String replacement, String[] words) { 253 this.replacement = replacement.toLowerCase(Locale.ENGLISH); 254 this.words = words; 255 256 // build regular expression for other words (for fast match) 257 StringBuilder expression = new StringBuilder(); 258 int maxLength = 0; 259 for (int i = 0; i < words.length; i++) { 260 if (words[i].length() > maxLength) { 261 maxLength = words[i].length(); 262 } 263 if (expression.length() > 0) { 264 expression.append('|'); 265 } 266 expression.append(Pattern.quote(words[i])); 267 } 268 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); 269 } 270 271 @Override 272 public String normalize(String name) { 273 // find first match 274 Matcher matcher = regExpr.matcher(name); 275 if (!matcher.find()) 276 return name; 277 278 int start = matcher.start(); 279 280 // which word matches? 281 String part = ""; 282 for (int i = 0; i < words.length; i++) { 283 String word = words[i]; 284 part = name.substring(start, start + word.length()); 285 if (word.equalsIgnoreCase(part)) { 286 break; 287 } 288 } 289 290 // replace the word 291 char[] newName = matcher.replaceFirst(replacement).toCharArray(); 292 293 // adjust case (replacement is not shorter than matching word!) 294 int minLength = Math.min(replacement.length(), part.length()); 295 for (int i = 0; i < minLength; i++) { 296 if (Character.isUpperCase(part.charAt(i))) { 297 newName[start + i] = Character.toUpperCase(newName[start + i]); 298 } 299 } 300 301 return new String(newName); 302 } 303 304 @Override 305 public String toString() { 306 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')'; 307 } 308 } 309}