--- lingea-trd-decoder.03.py	2007-10-28 17:16:49.000000000 +0100
+++ lingea-trd-decoder.py	2007-10-30 10:16:29.000000000 +0100
@@ -12,6 +12,7 @@
 # http://hp.vector.co.jp/authors/VA005784/cobuild/cobuildconv.html
 #
 # Version history:
+# 0.4 (30.10.2007) Patch by Petr Dlouhy, optional HTML generation
 # 0.3 (28.10.2007) Patch by Petr Dlouhy, cleanup, bugfix. More dictionaries.
 # 0.2 (19.7.2007) Changes, documentation, first 100% dictionary
 # 0.1 (20.5.2006) Initial version based on Nomad specs
@@ -19,10 +20,13 @@
 # Supported dictionaries:
 # - Lingea Německý Kapesní slovník
 # - Lingea Anglický Kapesní slovník
+# - Lingea 2002 series (theoretically)
 #
 # Modified by:
-# - Petr Dlouhy (petr.dlouhy|email.cz)
+# - Petr Dlouhy (petr.dlouhy | email.cz)
 # Generalization of data block rules, sampleFlag 0x04, sound out fix, data phrase prefix with comment (0x04)
+# HTML output, debugging patch, options on command line
+#
 # <write your name here>
 #
 # This library is free software; you can redistribute it and/or
@@ -41,26 +45,77 @@
 # Boston, MA 02111-1307, USA.
 
 # VERSION
-VERSION = "0.3"
+VERSION = "0.4"
 
-# DEBUGING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+import getopt, sys
+def usage():
+   """Print the program banner, usage line and option summary to stdout."""
+   print "Lingea Dictionary Decoder\n-------------------------"
+   print "Version: %s" % VERSION
+   print "Copyright (C) 2007 - Klokan Petr Pridal, Petr Dlouhy"
+   print
+   print "Usage: python lingea-trd-decoder.py [OPTIONS] DICTIONARY.trd > DICTIONARY.tab"
+   print "Result conversion by stardict-tools: /usr/lib/stardict-tools/tabfile"
+   print
+   print "    -o <num>      --out-style        : Output style"
+   print "                                          0   no tags"
+   print "                                          1   \\n tags"
+   print "                                          2   html tags (default)"
+   print "    -h            --help             : Print this message"
+   print "    -d            --debug            : Debug"
+   print "    -r            --debug-header     : Debug - print headers"
+   print "    -a            --debug-all        : Debug - print all records"
+   print "    -l <num>      --debug-limit      : Debug limit"
+   print
+   print "For HTML support in StarDict dictionary .ifo has to contain:"
+   print "sametypesequence=g"
+   print "!!! Change the .ifo file after generation by tabfile !!!"
+   print
+
+try:
+   opts, args = getopt.getopt(sys.argv[1:], "hdo:ral:", ["help", "debug", "out-style=", "debug-header", "debug-all", "debug-limit="])
+except getopt.GetoptError:
+   usage()
+   print "ERROR: Bad option"
+   sys.exit(2)
+   
+import locale
 DEBUG = False
-#DEBUG = True
-
-# If DEBUG and DEBUGHEADER, then print just all header records
-DEBUGHEADER = True
-#DEBUGHEADER = False
-
-# If DEBUG and DEBUGALL then print debug info for all records
+OUTSTYLE = 2
+DEBUGHEADER = False
 DEBUGALL = False
-#DEBUGALL = True
-
-# Number of wrong records for printing to stop during debugging 
 DEBUGLIMIT = 1
-
+# Apply the parsed command-line options on top of the defaults
+for o, a in opts:
+   if o in ("-d", "--debug"):
+      DEBUG = True  # enable debugging output
+   if o in ("-o", "--out-style"):
+      OUTSTYLE = locale.atoi(a)  # output style
+      if OUTSTYLE not in (0, 1, 2):  # a tag table exists only for styles 0, 1 and 2
+         usage()
+         print "ERROR: Output style has to be 0, 1 or 2"
+         sys.exit(2)
+   if o in ("-r", "--debug-header"):
+      # If DEBUG and DEBUGHEADER, then print just all header records
+      DEBUGHEADER = True
+   if o in ("-a", "--debug-all"):
+      # If DEBUG and DEBUGALL then print debug info for all records
+      DEBUGALL = True
+   if o in ("-h", "--help"):
+      usage()
+      sys.exit(0)
+   if o in ("-l", "--debug-limit"):
+      # Number of wrong records for printing to stop during debugging
+      DEBUGLIMIT = locale.atoi(a)
 # FILENAME is a first parameter on the commandline now
 
-import sys
+if len(args) == 1:
+    FILENAME = args[0]
+else:
+   usage()
+   print "ERROR: You have to specify .trd file to decode"
+   sys.exit(2)
+
 from struct import *
 import re
 
@@ -104,11 +159,86 @@
     '#SP50#', '#SP51#','#SP52#','#SP53#','#SP54#','#SP55#','#SP56#','#SP57#','#SP58#','#SP59#',
     '#SP60#', '#SP61#','#SP62#','#SP63#']
 
-wordclass = ('#0#','n:','adj:','pron:','#4#','v:','adv:','#7#','#8#','#9#',
+wordclass = ('#0#','n:','adj:','pron:','#4#','v:','adv:','prep:','#8#','#9#',
     'intr:','phr:','#12#','#13#','#14#','#15#','#16#','#17#','#18#','#19#',
     '#20#','#21#','#22#','#23#','#24#','#25#','#26#','#27#','#28#','#29#',
     '#30#','#31#')
 
+if OUTSTYLE == 0:
+    tag = {
+           'db':(''   ,''),    #Data beginning
+           'rn':(''   ,'\t'),  #Record name
+           'va':(''   ,' '),   #Header variant
+           'wc':('('  ,')'),   #WordClass
+           'pa':(''   ,' '),   #Header parts
+           'fo':('('  ,') '),  #Header forms
+           'on':('('  ,')' ),  #Header origin note
+           'pr':('['  ,']'),   #Header pronunciation
+           'dv':('{'  ,'} '),  #Header dataVariant
+           'sa':('`'  ,'`' ),  #Data sample
+           'sw':(''   ,''),    #Data sample wordclass; is not printed by Lingea
+           'do':('`'  ,'`' ),  #Data origin note
+           'df':(''   ,' '),   #Data definition
+           'ps':('"'  ,'" '),  #Data phrase short form
+           'pg':('"'  ,' = '), #Data phrase green
+           'pc':('`'  ,'`'),   #Data phrase comment; this comment is not printed by Lingea, but it seems useful
+           'p1':('"'  ,' = '), #Data phrase 1
+           'p2':(''   ,'" ' ), #Data phrase 2
+           'sp':('"'  ,' = ' ),#Data simple phrase
+           'b1':('"'  ,' = '), #Data phrase (block) 1
+           'b2':('" ' ,''),    #Data phrase (block) 2; NOTE(review): quote is in the prefix slot, unlike 'p2' - confirm
+           }
+if OUTSTYLE == 1:
+    tag = {
+           'db':('•'       ,''),      #Data beginning
+           'rn':(''        ,'\t'),    #Record name
+           'va':(''        ,' '),     #Header variant
+           'wc':(''        ,'\\n'),   #WordClass
+           'pa':(''        ,':\\n'),  #Header parts
+           'fo':('('       ,') '),    #Header forms
+           'on':('('       ,')\\n' ), #Header origin note
+           'pr':('['       ,']\\n'),  #Header pronunciation
+           'dv':('{'       ,'} '),    #Header dataVariant
+           'sa':('    '    ,'\\n' ),  #Data sample
+           'sw':(''        ,''),      #Data sample wordclass; is not printed by Lingea
+           'do':('    '    ,' ' ),    #Data origin note
+           'df':('    '    ,'\\n'),   #Data definition
+           'ps':('    '    ,'\\n'),   #Data phrase short form
+           'pg':('    '    ,' '),     #Data phrase green
+           'pc':('    '    ,' '),     #Data phrase comment; this comment is not printed by Lingea, but it seems useful
+           'p1':('    '    ,' '),     #Data phrase 1
+           'p2':('      '  ,'\\n' ),  #Data phrase 2
+           'sp':(''        ,'\\n' ),  #Data simple phrase
+           'b1':('"'       ,' = '),   #Data phrase (block) 1
+           'b2':('" '      ,''),      #Data phrase (block) 2; NOTE(review): quote is in the prefix slot - confirm
+          }
+if OUTSTYLE == 2:
+    tag = {
+           'db':('•'                                                 ,''),              #Data beginning
+           'rn':(''                                                  ,'\t'),            #Record name
+           'va':(''                                                  ,' '),             #Header variant
+           'wc':('<span size="larger" color="darkred" weight="bold">','</span>\\n'),    #WordClass
+           'pa':('<span size="larger" color="darkred" weight="bold">',':</span>\\n'),   #Header parts
+           'fo':('('                                                 ,') '),            #Header forms
+           'on':('<span color="blue">('                              ,')</span>\\n' ),  #Header origin note
+           'pr':('['                                                 ,']\\n'),          #Header pronunciation
+           'dv':('{'                                                 ,'} '),            #Header dataVariant
+           'sa':('    <span color="darkred" weight="bold">'          ,'</span>\\n' ),   #Data sample
+           'sw':(''                                                  ,''),              #Data sample wordclass; is not printed by Lingea
+           'do':('    <span color="darkred" weight="bold">'          ,'</span> ' ),     #Data origin note
+           'df':('    <span weight="bold">'                          ,'</span>\\n'),    #Data definition
+           'ps':('    <span color="dimgray" weight="bold">'          ,'</span>\\n'),    #Data phrase short form
+           'pg':('    <span color="darkgreen" style="italic">'       ,'</span> '),      #Data phrase green
+           'pc':('    <span color="darkgreen" style="italic">'       ,'</span> '),      #Data phrase comment; this comment is not printed by Lingea, but it seems useful
+           'p1':('    <span color="dimgray" style="italic">'         ,'</span> '),      #Data phrase 1
+           'p2':('      '                                            ,'\\n' ),          #Data phrase 2
+           'sp':('<span color="cyan">'                               ,'</span>\\n' ),   #Data simple phrase
+           'b1':('"'                                                 ,' = '),           #Data phrase (block) 1
+           'b2':('" '                                                ,''),              #Data phrase (block) 2; NOTE(review): quote is in the prefix slot - confirm
+          }
+
+
+
 # Print color debug functions
 purple = lambda c: '\x1b[1;35m'+c+'\x1b[0m'
 blue = lambda c: '\x1b[1;34m'+c+'\x1b[0m'
@@ -237,21 +367,43 @@
 re_c = re.compile(r'<c(.*?)>')
 
 def decode_tag_postprocessing(input):
-    """Decode and replace tags used in lingea dictionaries"""
+    """Decode and replace tags used in lingea dictionaries; decode internal tags"""
     s = input
 
     # General information in http://www.david-zbiral.cz/El-slovniky-plnaverze.htm#_Toc151656799
     # TODO: Better output handling
 
-    # ?? <d...> 
-    s = re_d.sub(r'(\1)',s)
-    # ?? <w...>
-    s = re_w.sub(r'(\1)',s)
-    # ?? <y...>
-    s = re_y.sub(r'(\1)',s)
-    # ?? <c...>
-    s = re_c.sub(r'(\1)',s)
-    # ...
+    if OUTSTYLE == 0:
+        # ?? <d...> 
+        s = re_d.sub(r'(\1)',s)
+        # ?? <w...>
+        s = re_w.sub(r'(\1)',s)
+        # ?? <y...>
+        s = re_y.sub(r'(\1)',s)
+        # ?? <c...>
+        s = re_c.sub(r'(\1)',s)
+        # ...
+    if OUTSTYLE == 1:
+        # ?? <d...> 
+        s = re_d.sub(r'(\1)',s)
+        # ?? <w...>
+        s = re_w.sub(r'(\1)',s)
+        # ?? <y...>
+        s = re_y.sub(r'(\1)',s)
+        # ?? <c...>
+        s = re_c.sub(r'(\1)',s)
+        # ...
+    if OUTSTYLE == 2:
+        # ?? <d...> 
+        s = re_d.sub(r'<span size="small" color="blue">(\1)</span>',s)
+        # ?? <w...>
+        s = re_w.sub(r'<span size="small" color="blue" style="italic">\1</span>',s)
+        # ?? <y...>
+        s = re_y.sub(r'<span size="small" color="blue" style="italic">\1</span>',s)
+        # ?? <c...>
+        s = re_c.sub(r'<span size="small" color="blue" style="italic">\1</span>',s)
+        # ...
+
     return s
 
 def toBin( b ):
@@ -304,30 +456,29 @@
     if mainFlag & 0x01:
         headerFlag = outInt("HeaderFlag: %s") # Blocks in header
         if headerFlag & 0x01:
-            s = outStr("Header record name: %s").replace('_','') # Remove character '_' from index
-            result += "%s\t" % s
+            result += tag['rn'][0] + outStr("Header record name: %s").replace('_','') + tag['rn'][1]  # Remove character '_' from index
         if headerFlag & 0x02:
-            result += outStr("Header variant: %s")+' '
+            result += tag['va'][0] + outStr("Header variant: %s") + tag['va'][1]
         if headerFlag & 0x04:
             s = outInt("Header wordclass: %s")
             if s < 32:
-                result += '(' + wordclass[s] + ') '
+                result += tag['wc'][0] + wordclass[s] + tag['wc'][1]
             else:
                 raise "Header wordclass out of range in: %s" % result
         if headerFlag & 0x08:
-            result += outStr("Header parts: %s") + ' '
+            result += tag['pa'][0] + outStr("Header parts: %s") + tag['pa'][1]
         if headerFlag & 0x10:
-            result += '(' + outStr("Header forms: %s") + ') '
+            result += tag['fo'][0] + outStr("Header forms: %s") + tag['fo'][1]
         if headerFlag & 0x20:
-            result += '(' + outStr("Header origin note: %s") + ') ' 
+            result += tag['on'][0] + outStr("Header origin note: %s") +  tag['on'][1]
         if headerFlag & 0x80:
-            result += '[' + pronunciation_encode(outStr("Header pronunciation: %s")) + '] '
+            result += tag['pr'][0] + pronunciation_encode(outStr("Header pronunciation: %s")) + tag['pr'][1]
     
     # Header data block
     if mainFlag & 0x02:
         headerFlag = outInt("Header dataFlag: %s") # Blocks in header
         if headerFlag & 0x02:
-            result += '{' + outStr("Header dataVariant: %s")+'} '
+            result += tag['dv'][0] + outStr("Header dataVariant: %s")+ tag['dv'][1]
 
     # ??? Link elsewhere
     pass
@@ -345,30 +496,43 @@
 
     #result += ': '
     li = 0
-    
+ 
+    #print just every first word class identifier
+    # TODO: this is not systematic (should be handled by output)
+    global lastWordClass
+    lastWordClass = 0
+
     # DATA BLOCK(S)
     # -------------
     for i in range(0, itemCount):
-        item = ""
+        item = tag['db'][0] + tag['db'][1]
         ol = False
         dataFlag = outInt("DataFlag: %s -----------------------------")
         if dataFlag & 0x01: # small index
             sampleFlag = outInt("Data sampleFlag: %s")
             if sampleFlag & 0x01:
-                item += '`' + outStr("Data sample: %s")+'` ' 
+                result += tag['sa'][0] + outStr("Data sample: %s") +  tag['sa'][1]
             if sampleFlag & 0x04:
-                outInt("Data sample: %s")
+                s = outInt("Data wordclass: %s")
+                if s != lastWordClass:  # emit the word class only when it differs from the previous one
+                    if s < 32:  # wordclass table has 32 entries
+                        result += tag['wc'][0] + wordclass[s] + tag['wc'][1]
+                    else:
+                        raise ValueError("Data wordclass out of range in: %s" % result)
+                lastWordClass = s
             if sampleFlag & 0x08:
-                result += outStr("Data sample wordclass: %s") + '\\n'
+                result += tag['sw'][0] + outStr("Data sample wordclass: %s") + tag['sw'][1]
             if sampleFlag & 0x10:
                 outInt("Data sample Int: %s")
                 outInt("Data sample Int: %s")
                 outInt("Data sample Int: %s")
             if sampleFlag & 0x20:
-                item += '`' + outStr("Data origin note: %s")+'` ' 
+                item += tag['do'][0] + outStr("Data origin note: %s") + tag['do'][1]
             if sampleFlag & 0x80:
-                result += '[' + pronunciation_encode(outStr("Data sample pronunciation: %s")) + '] '
+                item += "    "
+                result += tag['pr'][0] + pronunciation_encode(outStr("Data sample pronunciation: %s")) + tag['pr'][1]
         if dataFlag & 0x02:
+            item += "    "
             subFlag = outInt("Data subFlag: %s")
             if subFlag == 0x80:
                 outStr("Data sub prefix: %s")
@@ -378,35 +542,37 @@
         if dataFlag & 0x04: # chart
             pass # ???
         if dataFlag & 0x08: # reference
-            item += outStr("Data definition: %s")+' ' 
+            item += tag['df'][0] + outStr("Data definition: %s") + tag['df'][1]
         if dataFlag & 0x10:
             pass # ???
         if dataFlag & 0x20: # phrase
             phraseFlag1 = outInt("Data phraseFlag1: %s")
             if phraseFlag1 & 0x01:
-                item += '"' + outStr("Data phrase short form: %s") + '" '
+                item += tag['ps'][0] + outStr("Data phrase short form: %s") + tag['ps'][1]
             if phraseFlag1 & 0x02:
                 phraseCount = outInt("Data phraseCount: %s")
                 for i in range(0, phraseCount):
                     phraseComment = outInt("Data phrase prefix")
-                    item += '"'+outStr("Data phrase 1: %s")+' = ' 
                     if phraseComment & 0x04:
-                        item += outStr("Data phrase comment: %s") 
-                    item += outStr("Data phrase 2: %s")+'" ' 
+                       item += tag['pc'][0] + outStr("Data phrase comment: %s")  + tag['pc'][1]
+                    item += tag['p1'][0] + outStr("Data phrase 1: %s") + tag['p1'][1]
+                    item += tag['p2'][0] + outStr("Data phrase 2: %s") + tag['p2'][1]
             if phraseFlag1 & 0x04:
                 phraseCount = outInt("Data phraseCount: %s")
                 for i in range(0, phraseCount):
                     phraseComment = outInt("Data phrase prefix")
-                    item += '"'+outStr("Data phrase 1: %s")+' = ' 
                     if phraseComment & 0x04:
-                        item += outStr("Data phrase comment: %s") 
-                    item += outStr("Data phrase 2: %s")+'" ' 
+                        item += tag['pc'][0] + outStr("Data phrase comment: %s") + tag['pc'][1]  # optional comment precedes the phrase
+                    item += tag['pg'][0] + outStr("Data phrase 1: %s") + tag['pg'][1]
+                    item += tag['p2'][0] + outStr("Data phrase 2: %s") + tag['p2'][1]
             if phraseFlag1 & 0x08:
                 phraseCount = outInt("Data simple phraseCount: %s")
                 for i in range(0, phraseCount):
-                    item += '"'+outStr("Data simple phrase: %s")+' = ' 
+                    item += "    "
+                    item += tag['sp'][0] + outStr("Data simple phrase: %s") +  tag['sp'][1]
             if phraseFlag1 & 0x40:
-                item += outStr("Data phrase short form: %s")+ ' '
+                item += tag['ps'][0] + outStr("Data phrase short form: %s") + tag['ps'][1]
+
 
             # TODO: be careful in changing the rules, to have back compatibility! 
         if dataFlag & 0x40: # reference, related language
@@ -427,23 +593,23 @@
                 result += "\\nphr: "
                 li = 1
                 ol = True
-                item += '"'+outStr("Data phrase 1: %s") + ' = '
+                item += tag['b1'][0]+outStr("Data phrase 1: %s") + tag['b1'][1]
                 out("Data phrase block: %s")
                 out("Data phrase block: %s")
                 out("Data phrase block: %s")
                 out("Data phrase block: %s")
-                item += outStr("Data phrase 2: %s") + '" '
+                item += tag['b2'][0] + outStr("Data phrase 2: %s") + tag['b2'][1]  # second half of the phrase opened with 'b1'
             if flags == [0x80,0x80,0xF9,0xDF,0x9D,0x00,0x23,0x01]:
                 result += "\\nphr: "
                 li = 1
                 ol = True
-                item += '"'+outStr("Data phrase 1: %s") + ' = '
+                item += tag['b1'][0]+outStr("Data phrase 1: %s") + tag['b1'][1]
                 out("Data phrase block: %s")
                 out("Data phrase block: %s")
                 out("Data phrase block: %s")
                 out("Data phrase block: %s")
                 out("Data phrase block: %s")
-                item += outStr("Data phrase 2: %s") + '" '
+                item += tag['b2'][0] + outStr("Data phrase 2: %s") + tag['b2'][1]  # second half of the phrase opened with 'b1'
         if ol:
             result += "\\n%d. %s" % (li, item)
         else:
@@ -462,19 +628,6 @@
 # MAIN
 ################################################################
 
-if len(sys.argv) > 1:
-    FILENAME = sys.argv[1]
-else:
-    print "Lingea Dictionary Decoder"
-    print "-------------------------"
-    print "Version: %s" % VERSION
-    print "Copyright (C) 2007 - Klokan Petr Pridal"
-    print
-    print "Usage: python lingea-trd-decoder.py DICTIONARY.trd > DICTIONARY.tab"
-    print "Result convertion by stardict-tools: /usr/lib/stardict-tools/tabfile"
-    print
-    print "ERROR: You have to specify .trd file to decode"
-    sys.exit(1)
 
 f = open(FILENAME,'rb')
 
@@ -515,10 +668,11 @@
     for i in range(1,entryCount):
         if not DEBUGALL:
             DEBUG = False
+        s = decode(getRec(i))
         if DEBUGHEADER:
-            s = decode(getRec(i))
-            print s.split('\t')[0]
-        if DEBUGLIMIT > 0 and not decode(getRec(i)).endswith('\n'):
+            # print s.split('\t')[0]
+            print s
+        if DEBUGLIMIT > 0 and not s.endswith('\n'):
             DEBUG = True
             print "-"*80
             print "%s) at address %s" % (i, toBin(index[i]))
