summaryrefslogtreecommitdiff
path: root/tools/icu/icutrim.py
diff options
context:
space:
mode:
author: Steven R. Loomis <srl@icu-project.org> 2014-09-04 22:03:24 -0700
committer: Trevor Norris <trev.norris@gmail.com> 2014-10-01 12:16:51 -0700
commit: ac2857b12cd819b68405b15c3f8e95e48bcc32d8 (patch)
tree: 201772e4e1d201b8a84f584ff5155d36fb4082a0 /tools/icu/icutrim.py
parent: 95726b0fce0ae1ae60591f0a535515e8dabfd6df (diff)
downloadandroid-node-v8-ac2857b12cd819b68405b15c3f8e95e48bcc32d8.tar.gz
android-node-v8-ac2857b12cd819b68405b15c3f8e95e48bcc32d8.tar.bz2
android-node-v8-ac2857b12cd819b68405b15c3f8e95e48bcc32d8.zip
build, i18n: improve Intl build, add "--with-intl"
The two main goals of this change are: - To make it easier to build the Intl option using ICU (particularly, using a newer ICU than v8/Chromium's version) - To enable a much smaller ICU build with only English support The goal here is to get node.js binaries built this way by default so that the Intl API can be used. Additional data can be added at execution time (see Readme and wiki) More details are at https://github.com/joyent/node/pull/7719 In particular, this change adds the "--with-intl=" configure option to provide more ways of building "Intl": - "full-icu" picks up an ICU from deps/icu - "small-icu" is similar, but builds only English - "system-icu" uses pkg-config to find an installed ICU - "none" does nothing (no Intl) For Windows builds, the "full-icu" or "small-icu" options are added to vcbuild.bat. Note that the existing "--with-icu-path" option is not removed from configure, but may not be used alongside the new option. Wiki changes have already been made on https://github.com/joyent/node/wiki/Installation and a new page created at https://github.com/joyent/node/wiki/Intl (marked as provisional until this change lands.) Summary of changes: * README.md : doc updates * .gitignore : added "deps/icu" as this is the location where ICU is unpacked to. * Makefile : added the tools/icu/* files to cpplint, but excluded a problematic file. * configure : added the "--with-intl" option mentioned above. Calculate at config time the list of ICU source files to use and data packaging options. * node.gyp : add the new files src/node_i18n.cc/.h as well as ICU linkage. * src/node.cc : add call into node::i18n::InitializeICUDirectory(icu_data_dir) as well as new --icu-data-dir option and NODE_ICU_DATA env variable to configure ICU data loading. This loading is only relevant in the "small" configuration. * src/node_i18n.cc : new source file for the above Initialize.. function, to setup ICU as needed. * tools/icu : new directory with some tools needed for this build. 
* tools/icu/icu-generic.gyp : new .gyp file that builds ICU in some new ways, both on unix/mac and windows. * tools/icu/icu-system.gyp : new .gyp file to build node against a pkg-config detected ICU. * tools/icu/icu_small.json : new config file for the "English-only" small build. * tools/icu/icutrim.py : new tool for trimming down ICU data. Reads the above .json file. * tools/icu/iculslocs.cc : new tool for repairing ICU data manifests after trim operation. * tools/icu/no-op.cc : dummy file to force .gyp into using a C++ linker. * vcbuild.bat : added small-icu and full-icu options, to call into configure. * Fixed toolset dependencies, see https://github.com/joyent/node/pull/7719#issuecomment-54641687 Note that because of a bug in gyp {CC,CXX}_host must also be set. Otherwise gcc/g++ will be used by default for part of the build. Reviewed-by: Trevor Norris <trev.norris@gmail.com> Reviewed-by: Fedor Indutny <fedor@indutny.com>
Diffstat (limited to 'tools/icu/icutrim.py')
-rwxr-xr-x tools/icu/icutrim.py 338
1 files changed, 338 insertions, 0 deletions
diff --git a/tools/icu/icutrim.py b/tools/icu/icutrim.py
new file mode 100755
index 0000000000..f6b54956c5
--- /dev/null
+++ b/tools/icu/icutrim.py
@@ -0,0 +1,338 @@
+#!/usr/bin/python
+#
+# Copyright (C) 2014 IBM Corporation and Others. All Rights Reserved.
+#
+# @author Steven R. Loomis <srl@icu-project.org>
+#
+# This tool slims down an ICU data (.dat) file according to a config file.
+#
+# See: http://bugs.icu-project.org/trac/ticket/10922
+#
+# Usage:
+# Use "-h" to get help options.
+
+import sys
+import shutil
+# for utf-8
+reload(sys)
+sys.setdefaultencoding("utf-8")
+
+import argparse
+import os
+import json
+import re
+
+endian=sys.byteorder
+
+parser = argparse.ArgumentParser(description="ICU Datafile repackager. Example of use: \"mkdir tmp ; python icutrim.py -D ~/Downloads/icudt53l.dat -T tmp -F trim_en.json -O icudt53l.dat\" you will then find a smaller icudt53l.dat in 'tmp'. ",
+ epilog="ICU tool, http://icu-project.org - master copy at http://source.icu-project.org/repos/icu/tools/trunk/scripts/icutrim.py")
+
+parser.add_argument("-P","--tool-path",
+ action="store",
+ dest="toolpath",
+ help="set the prefix directory for ICU tools")
+
+parser.add_argument("-D","--input-file",
+ action="store",
+ dest="datfile",
+ help="input data file (icudt__.dat)",
+ required=True)
+
+parser.add_argument("-F","--filter-file",
+ action="store",
+ dest="filterfile",
+ help="filter file (JSON format)",
+ required=True)
+
+parser.add_argument("-T","--tmp-dir",
+ action="store",
+ dest="tmpdir",
+ help="working directory.",
+ required=True)
+
+parser.add_argument("--delete-tmp",
+ action="count",
+ dest="deltmpdir",
+ help="delete working directory.",
+ default=0)
+
+parser.add_argument("-O","--outfile",
+ action="store",
+ dest="outfile",
+ help="outfile (NOT a full path)",
+ required=True)
+
+parser.add_argument("-v","--verbose",
+ action="count",
+ default=0)
+
+parser.add_argument('-e', '--endian', action='store', dest='endian', help='endian, big, little or host, your default is "%s".' % endian, default=endian, metavar='endianness')
+
+
+args = parser.parse_args()
+
+if args.verbose>0:
+ print "Options: "+str(args)
+
+if (os.path.isdir(args.tmpdir) and args.deltmpdir):
+ if args.verbose>1:
+ print "Deleting tmp dir %s.." % (args.tmpdir)
+ shutil.rmtree(args.tmpdir)
+
+if not (os.path.isdir(args.tmpdir)):
+ os.mkdir(args.tmpdir)
+else:
+ print "Please delete tmpdir %s before beginning." % args.tmpdir
+ sys.exit(1)
+
+if args.endian not in ("big","little","host"):
+ print "Unknown endianness: %s" % args.endian
+ sys.exit(1)
+
+if args.endian is "host":
+ args.endian = endian
+
+if not os.path.isdir(args.tmpdir):
+ print "Error, tmpdir not a directory: %s" % (args.tmpdir)
+ sys.exit(1)
+
+if not os.path.isfile(args.filterfile):
+ print "Filterfile doesn't exist: %s" % (args.filterfile)
+ sys.exit(1)
+
+if not os.path.isfile(args.datfile):
+ print "Datfile doesn't exist: %s" % (args.datfile)
+ sys.exit(1)
+
+if not args.datfile.endswith(".dat"):
+ print "Datfile doesn't end with .dat: %s" % (args.datfile)
+ sys.exit(1)
+
+outfile = os.path.join(args.tmpdir, args.outfile)
+
+if os.path.isfile(outfile):
+ print "Error, output file does exist: %s" % (outfile)
+ sys.exit(1)
+
+if not args.outfile.endswith(".dat"):
+ print "Outfile doesn't end with .dat: %s" % (args.outfile)
+ sys.exit(1)
+
+dataname=args.outfile[0:-4]
+
+
+## TODO: need to improve this. Quotes, etc.
+def runcmd(tool, cmd, doContinue=False):
+ if(args.toolpath):
+ cmd = os.path.join(args.toolpath, tool) + " " + cmd
+ else:
+ cmd = tool + " " + cmd
+
+ if(args.verbose>4):
+ print "# " + cmd
+
+ rc = os.system(cmd)
+ if rc is not 0 and not doContinue:
+ print "FAILED: %s" % cmd
+ sys.exit(1)
+ return rc
+
+## STEP 0 - read in json config
+fi= open(args.filterfile, "rb")
+config=json.load(fi)
+fi.close()
+
+if (args.verbose > 6):
+ print config
+
+if(config.has_key("comment")):
+ print "%s: %s" % (args.filterfile, config["comment"])
+
+## STEP 1 - copy the data file, swapping endianness
+endian_letter = "l"
+
+
+runcmd("icupkg", "-t%s %s %s""" % (endian_letter, args.datfile, outfile))
+
+## STEP 2 - get listing
+listfile = os.path.join(args.tmpdir,"icudata.lst")
+runcmd("icupkg", "-l %s > %s""" % (outfile, listfile))
+
+fi = open(listfile, 'rb')
+items = fi.readlines()
+items = [items[i].strip() for i in range(len(items))]
+fi.close()
+
+itemset = set(items)
+
+if (args.verbose>1):
+ print "input file: %d items" % (len(items))
+
+# list of all trees
+trees = {}
+RES_INDX = "res_index.res"
+remove = None
+# remove - always remove these
+if config.has_key("remove"):
+ remove = set(config["remove"])
+else:
+ remove = set()
+
+# keep - always keep these
+if config.has_key("keep"):
+ keep = set(config["keep"])
+else:
+ keep = set()
+
+def queueForRemoval(tree):
+ global remove
+ if not config.has_key("trees"):
+ # no config
+ return
+ if not config["trees"].has_key(tree):
+ return
+ mytree = trees[tree]
+ if(args.verbose>0):
+ print "* %s: %d items" % (tree, len(mytree["locs"]))
+ # do varible substitution for this tree here
+ if type(config["trees"][tree]) == str or type(config["trees"][tree]) == unicode:
+ treeStr = config["trees"][tree]
+ if(args.verbose>5):
+ print " Substituting $%s for tree %s" % (treeStr, tree)
+ if(not config.has_key("variables") or not config["variables"].has_key(treeStr)):
+ print " ERROR: no variable: variables.%s for tree %s" % (treeStr, tree)
+ sys.exit(1)
+ config["trees"][tree] = config["variables"][treeStr]
+ myconfig = config["trees"][tree]
+ if(args.verbose>4):
+ print " Config: %s" % (myconfig)
+ # Process this tree
+ if(len(myconfig)==0 or len(mytree["locs"])==0):
+ if(args.verbose>2):
+ print " No processing for %s - skipping" % (tree)
+ else:
+ only = None
+ if myconfig.has_key("only"):
+ only = set(myconfig["only"])
+ if (len(only)==0) and (mytree["treeprefix"] != ""):
+ thePool = "%spool.res" % (mytree["treeprefix"])
+ if (thePool in itemset):
+ if(args.verbose>0):
+ print "Removing %s because tree %s is empty." % (thePool, tree)
+ remove.add(thePool)
+ else:
+ print "tree %s - no ONLY"
+ for l in range(len(mytree["locs"])):
+ loc = mytree["locs"][l]
+ if (only is not None) and not loc in only:
+ # REMOVE loc
+ toRemove = "%s%s%s" % (mytree["treeprefix"], loc, mytree["extension"])
+ if(args.verbose>6):
+ print "Queueing for removal: %s" % toRemove
+ remove.add(toRemove)
+
+def addTreeByType(tree, mytree):
+ if(args.verbose>1):
+ print "(considering %s): %s" % (tree, mytree)
+ trees[tree] = mytree
+ mytree["locs"]=[]
+ for i in range(len(items)):
+ item = items[i]
+ if item.startswith(mytree["treeprefix"]) and item.endswith(mytree["extension"]):
+ mytree["locs"].append(item[len(mytree["treeprefix"]):-4])
+ # now, process
+ queueForRemoval(tree)
+
+addTreeByType("converters",{"treeprefix":"", "extension":".cnv"})
+addTreeByType("stringprep",{"treeprefix":"", "extension":".spp"})
+addTreeByType("translit",{"treeprefix":"translit/", "extension":".res"})
+addTreeByType("brkfiles",{"treeprefix":"brkitr/", "extension":".brk"})
+addTreeByType("brkdict",{"treeprefix":"brkitr/", "extension":"dict"})
+addTreeByType("confusables",{"treeprefix":"", "extension":".cfu"})
+
+for i in range(len(items)):
+ item = items[i]
+ if item.endswith(RES_INDX):
+ treeprefix = item[0:item.rindex(RES_INDX)]
+ tree = None
+ if treeprefix == "":
+ tree = "ROOT"
+ else:
+ tree = treeprefix[0:-1]
+ if(args.verbose>6):
+ print "procesing %s" % (tree)
+ trees[tree] = { "extension": ".res", "treeprefix": treeprefix, "hasIndex": True }
+ # read in the resource list for the tree
+ treelistfile = os.path.join(args.tmpdir,"%s.lst" % tree)
+ runcmd("iculslocs", "-i %s -N %s -T %s -l > %s" % (outfile, dataname, tree, treelistfile))
+ fi = open(treelistfile, 'rb')
+ treeitems = fi.readlines()
+ trees[tree]["locs"] = [treeitems[i].strip() for i in range(len(treeitems))]
+ fi.close()
+ if(not config.has_key("trees") or not config["trees"].has_key(tree)):
+ print " Warning: filter file %s does not mention trees.%s - will be kept as-is" % (args.filterfile, tree)
+ else:
+ queueForRemoval(tree)
+
+def removeList(count=0):
+ # don't allow "keep" items to creep in here.
+ global remove
+ remove = remove - keep
+ if(count > 10):
+ print "Giving up - %dth attempt at removal." % count
+ sys.exit(1)
+ if(args.verbose>1):
+ print "%d items to remove - try #%d" % (len(remove),count)
+ if(len(remove)>0):
+ oldcount = len(remove)
+ hackerrfile=os.path.join(args.tmpdir, "REMOVE.err")
+ removefile = os.path.join(args.tmpdir, "REMOVE.lst")
+ fi = open(removefile, 'wb')
+ for i in remove:
+ print >>fi, i
+ fi.close()
+ rc = runcmd("icupkg","-r %s %s 2> %s" % (removefile,outfile,hackerrfile),True)
+ if rc is not 0:
+ if(args.verbose>5):
+ print "## Damage control, trying to parse stderr from icupkg.."
+ fi = open(hackerrfile, 'rb')
+ erritems = fi.readlines()
+ fi.close()
+ #Item zone/zh_Hant_TW.res depends on missing item zone/zh_Hant.res
+ pat = re.compile("""^Item ([^ ]+) depends on missing item ([^ ]+).*""")
+ for i in range(len(erritems)):
+ line = erritems[i].strip()
+ m = pat.match(line)
+ if m:
+ toDelete = m.group(1)
+ if(args.verbose > 5):
+ print "<< %s added to delete" % toDelete
+ remove.add(toDelete)
+ else:
+ print "ERROR: could not match errline: %s" % line
+ sys.exit(1)
+ if(args.verbose > 5):
+ print " now %d items to remove" % len(remove)
+ if(oldcount == len(remove)):
+ print " ERROR: could not add any mor eitems to remove. Fail."
+ sys.exit(1)
+ removeList(count+1)
+
+# fire it up
+removeList(1)
+
+# now, fixup res_index, one at a time
+for tree in trees:
+ # skip trees that don't have res_index
+ if not trees[tree].has_key("hasIndex"):
+ continue
+ treebunddir = args.tmpdir
+ if(trees[tree]["treeprefix"]):
+ treebunddir = os.path.join(treebunddir, trees[tree]["treeprefix"])
+ if not (os.path.isdir(treebunddir)):
+ os.mkdir(treebunddir)
+ treebundres = os.path.join(treebunddir,RES_INDX)
+ treebundtxt = "%s.txt" % (treebundres[0:-4])
+ runcmd("iculslocs", "-i %s -N %s -T %s -b %s" % (outfile, dataname, tree, treebundtxt))
+ runcmd("genrb","-d %s -s %s res_index.txt" % (treebunddir, treebunddir))
+ runcmd("icupkg","-s %s -a %s%s %s" % (args.tmpdir, trees[tree]["treeprefix"], RES_INDX, outfile))