Package translate :: Package storage :: Module dtd
[hide private]
[frames] | [no frames]

Source Code for Module translate.storage.dtd

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile) 
 23  these are specific .dtd files for localisation used by mozilla""" 
 24   
 25  from translate.storage import base 
 26  from translate.misc import quote 
 27   
 28  import re 
 29  import warnings 
 30  try: 
 31      from lxml import etree 
 32      import StringIO 
 33  except ImportError: 
 34      etree = None 
 35   
 36  labelsuffixes = (".label", ".title") 
 37  """Label suffixes: entries with this suffix are able to be combined with accesskeys 
 38  found in entries ending with L{accesskeysuffixes}""" 
 39  accesskeysuffixes = (".accesskey", ".accessKey", ".akey") 
 40  """Accesskey Suffixes: entries with this suffix may be combined with labels 
 41  ending in L{labelsuffixes} into accelerator notation""" 
 42   
def quotefordtd(source):
    """Quote a string so it can be used as a dtd entity definition.

    The quoting character is chosen from the content of C{source}:
      - no double quote present: delegate to C{quote.quotestr}
      - double quotes but no single quotes: delegate to
        C{quote.singlequotestr}
      - both kinds present: wrap in single quotes and escape every embedded
        single quote as the C{&apos;} entity, so the value cannot terminate
        the literal early

    @type source: String
    @param source: The unquoted entity value
    @rtype: String
    @return: The quoted definition
    """
    if '"' in source:
        if "'" in source:
            # The escape must be the literal entity text "&apos;"; it is
            # reversed again by unquotefromdtd.
            return "'" + source.replace("'", '&apos;') + "'"
        else:
            return quote.singlequotestr(source)
    else:
        return quote.quotestr(source)
51
def unquotefromdtd(source):
    """unquotes a quoted dtd definition

    An empty C{source} is treated as the empty definition C{""}.  The first
    character of C{source} is taken to be the quote character and the text
    between the quotes is extracted with C{quote.extractwithoutquotes}.
    For single-quoted definitions any C{&apos;} entity is turned back into a
    literal single quote, which reverses the escaping done by
    L{quotefordtd}.

    @type source: String
    @param source: The quoted definition as stored in the dtd file
    @rtype: String
    @return: The definition without its surrounding quotes
    """
    # extract the string, get rid of quoting
    if len(source) == 0:
        source = '""'
    quotechar = source[0]
    # the second item of the returned pair is not needed here
    extracted = quote.extractwithoutquotes(source, quotechar, quotechar, allowreentry=False)[0]
    if quotechar == "'" and "&apos;" in extracted:
        extracted = extracted.replace("&apos;", "'")
    # the quote characters should be the first and last characters in the string
    # of course there could also be quote characters within the string; not handled here
    return extracted
64
def removeinvalidamps(name, value):
    """Find and remove ampersands that are not part of an entity definition.

    A stray & in a DTD file can break an application's ability to parse the
    file.  In Mozilla localisation this is very important and these can break
    the parsing of files used in XUL and thus break interface rendering.
    Tracking down the problem is very difficult, thus by removing potential
    broken & and warning the users we can ensure that the output DTD will
    always be parsable.

    An ampersand is kept only when it is followed, somewhere later in the
    value, by a semicolon and the text in between is a valid entity name.
    A warning is issued (once per call) when anything is removed.

    @type name: String
    @param name: Entity name
    @type value: String
    @param value: Entity text value
    @rtype: String
    @return: Entity value without bad ampersands
    """
    def is_valid_entity_name(name):
        """Check that supplied L{name} is a valid entity name"""
        # "&;" yields an empty name: it is not a valid reference, and the
        # name[0] test below would raise IndexError on it.
        if not name:
            return False
        if name.replace('.', '').isalnum():
            return True
        # numeric character references such as "#160"
        return name[0] == '#' and name[1:].isalnum()

    invalid_amps = []
    amppos = value.find("&")
    while amppos != -1:
        semipos = value.find(";", amppos + 1)
        if semipos == -1 or not is_valid_entity_name(value[amppos + 1:semipos]):
            invalid_amps.append(amppos)
        amppos = value.find("&", amppos + 1)
    if invalid_amps:
        warnings.warn("invalid ampersands in dtd entity %s" % (name))
        # delete from the right so that the remaining positions stay valid
        for amppos in reversed(invalid_amps):
            value = value[:amppos] + value[amppos + 1:]
    return value
class dtdunit(base.TranslationUnit):
    """this class represents an entity definition from a dtd file (and possibly associated comments)"""

    def __init__(self, source=""):
        """construct the dtdunit, prepare it for parsing"""
        super(dtdunit, self).__init__(source)
        # (commenttype, text) tuples collected by parse()
        self.comments = []
        # lines that are neither comments nor part of an entity declaration
        self.unparsedlines = []
        # parser state: currently inside a comment / an entity declaration
        self.incomment = False
        self.inentity = False
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        # assigned through the property so that self.definition gets set
        self.source = source

    # Note that source and target are equivalent for monolingual units
    def setsource(self, source):
        """Sets the definition to the quoted value of source"""
        self.definition = quotefordtd(source)
        self._rich_source = None

    def getsource(self):
        """gets the unquoted source string"""
        return unquotefromdtd(self.definition)
    source = property(getsource, setsource)

    def settarget(self, target):
        """Sets the definition to the quoted value of target (None is stored as an empty string)"""
        if target is None:
            target = ""
        self.definition = quotefordtd(target)
        self._rich_target = None

    def gettarget(self):
        """gets the unquoted target string"""
        return unquotefromdtd(self.definition)
    target = property(gettarget, settarget)

    def isnull(self):
        """returns whether this dtdunit doesn't actually have an entity definition"""
        # a unit is null when parse() found no entity, i.e. self.entity is
        # still None (the entity corresponds to the location in other formats)
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def parse(self, dtdsrc):
        """read the first dtd element from the source code into this object, return linesprocessed

        The source is processed line by line.  Comments are collected as
        (commenttype, text) tuples, lines that belong to neither a comment
        nor an entity declaration go to self.unparsedlines, and the first
        entity declaration found fills self.entity and self.definition.
        Processing stops as soon as the quoted definition is complete.

        @type dtdsrc: String
        @param dtdsrc: dtd source text, possibly spanning several lines
        @rtype: Integer
        @return: number of lines consumed (0 for empty input)
        @raise ValueError: if the entity value does not start with a single
        or double quote
        """
        self.comments = []
        # make all the lists the same: every kind of comment ends up, in
        # file order, in the single self.comments list
        self.locfilenotes = self.comments
        self.locgroupstarts = self.comments
        self.locgroupends = self.comments
        self.locnotes = self.comments
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            if not self.incomment:
                if (line.find('<!--') != -1):
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        notepos = quote.findend(comment, 'LOCALIZATION NOTE')
                        # skip the spaces after the marker; the bounds check
                        # avoids an IndexError if the text ends in spaces
                        while (notepos < len(comment) and comment[notepos] == ' '):
                            notepos += 1
                        if comment.find('FILE', notepos) == notepos:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', notepos) == notepos:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', notepos) == notepos:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                #FIXME: bloody entity might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # a line containing "%...;" outside an entity is kept
                    # verbatim as a plain comment
                    self.comments.append(("comment", line))
                    continue

            if self.incomment:
                # some kind of comment
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # TODO: entity definitions that are commented out could be
                # parsed here and stored as obsolete messages
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self.locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self.locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self.locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self.locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = True
                    # text such as "#expand" in front of the declaration is
                    # remembered so that getoutput() can write it back
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line, '<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    e = 0
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.entity = ''
                    if (e < len(line) and line[e] == '%'):
                        # "<!ENTITY % name ..." is an external entity
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                    while (e < len(line) and not line[e].isspace()):
                        self.entity += line[e]
                        e += 1
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # the rest of the declaration is on a later line
                            self.entityhelp = None
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    paramstart = e
                    while (e < len(line) and line[e].isalnum()):
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        e = 0
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # self.entityhelp is (start offset in this line, quote character)
                    e = self.entityhelp[0]
                    if (self.entityhelp[1] == "'"):
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif (self.entityhelp[1] == '"'):
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # closing quote reached: this unit is complete
                        self.inentity = False
                        break

        return linesprocessed

    def __str__(self):
        """convert to a string. unicode output is encoded with self.encoding, falling back to UTF-8"""
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the dtd entity back to string form

        Comments come first, then the unparsed lines, then the entity
        declaration itself.  A null unit (see L{isnull}) yields only the
        comments and unparsed lines, right-stripped and ending in a single
        newline.  A unit whose entity name is empty yields no declaration.
        """
        lines = []
        lines.extend([comment for commenttype, comment in self.comments])
        lines.extend(self.unparsedlines)
        if self.isnull():
            result = "".join(lines)
            return result.rstrip() + "\n"
        if len(self.entity) > 0:
            if getattr(self, 'entitytype', None) == 'external':
                entityline = '<!ENTITY % '+self.entity+' '+self.entityparameter+' '+self.definition+'>'
            else:
                entityline = '<!ENTITY '+self.entity+' '+self.definition+'>'
            # everything that uses entityline stays inside this branch so an
            # empty entity name cannot reach it with the name unbound
            if getattr(self, 'hashprefix', None):
                entityline = self.hashprefix + " " + entityline
            if isinstance(entityline, unicode):
                entityline = entityline.encode('UTF-8')
            lines.append(entityline+'\n')
        return "".join(lines)
class dtdfile(base.TranslationStore):
    """this class represents a .dtd file, made up of dtdunits"""
    UnitClass = dtdunit

    def __init__(self, inputfile=None):
        """construct a dtdfile, optionally reading in from inputfile

        @param inputfile: object with a read() method (and optionally a name
        attribute, used as the store's filename), or None for an empty store
        """
        base.TranslationStore.__init__(self, unitclass=self.UnitClass)
        self.filename = getattr(inputfile, 'name', '')
        if inputfile is not None:
            dtdsrc = inputfile.read()
            self.parse(dtdsrc)
            self.makeindex()

    def parse(self, dtdsrc):
        """read the source code of a dtd file in and include them as dtdunits in self.units

        The text is cut into chunks that end on a line starting with a
        closing quote followed by ">" (after an "<!ENTITY" has been seen);
        each chunk is then handed to fresh dtdunits until it is used up.
        Parse errors are reported with warnings.warn instead of being raised.
        """
        start = 0
        end = 0
        lines = dtdsrc.split("\n")
        while end < len(lines):
            if (start == end):
                end += 1
            foundentity = False
            while end < len(lines):
                if lines[end].find('<!ENTITY') > -1:
                    foundentity = True
                # a line that begins with a quote and ">" closes a
                # multi-line entity definition
                if foundentity and re.match(r"[\"']\s*>", lines[end]):
                    end += 1
                    break
                end += 1

            linesprocessed = 1 # to initialise loop
            while linesprocessed >= 1:
                newdtd = dtdunit()
                try:
                    linesprocessed = newdtd.parse("\n".join(lines[start:end]))
                    if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines):
                        self.units.append(newdtd)
                except Exception as e:
                    # NOTE(review): linesprocessed keeps its previous value
                    # here, so start advances by a stale count - confirm this
                    # recovery behaviour is intended
                    warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end])))
                start += linesprocessed

    def __str__(self):
        """convert to a string. unicode output is encoded with self.encoding, falling back to UTF-8

        A warning is issued and None is returned if the store does not
        validate.
        """
        source = self.getoutput()
        if not self._valid_store():
            warnings.warn("DTD file '%s' does not validate" % self.filename)
            # NOTE(review): str(store) raises TypeError when __str__ returns
            # None - confirm callers rely on calling __str__() directly
            return None
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the units back to source"""
        sources = [str(dtd) for dtd in self.units]
        return "".join(sources)

    def makeindex(self):
        """makes self.index dictionary keyed on entities"""
        self.index = {}
        for dtd in self.units:
            if not dtd.isnull():
                self.index[dtd.entity] = dtd

    def _valid_store(self):
        """Validate the store to determine if it is valid

        This uses lxml's etree.DTD to parse the output.  When lxml could not
        be imported (etree is None) no check is done and the store is
        reported as valid.

        @return: If the store passes validation
        @rtype: Boolean
        """
        if etree is not None:
            try:
                # #expand is a Mozilla hack and are removed as they are not valid in DTDs
                etree.DTD(StringIO.StringIO(re.sub("#expand", "", self.getoutput())))
            except etree.DTDParseError:
                return False
        return True