From ec31631bb097be9780c7355d4183bfd5050c5af4 Mon Sep 17 00:00:00 2001
From: Magnus Ahltorp <map@kth.se>
Date: Thu, 19 Feb 2015 13:39:19 +0100
Subject: fetchallcerts.py: handle precerts; submitcert.py: handle .zip files;
 fetchallcerts.py: Always calculate full tree; fetchallcerts.py: Cache level 16
 hashes; fetchallcerts.py: Save STH

---
 tools/certtools.py     |  93 ++++++++++++++++++++++++++++++--
 tools/fetchallcerts.py | 142 +++++++++++++++++++++++++++++++++++++++++--------
 tools/submitcert.py    |  51 +++++++++++++-----
 3 files changed, 249 insertions(+), 37 deletions(-)
 mode change 100644 => 100755 tools/fetchallcerts.py

diff --git a/tools/certtools.py b/tools/certtools.py
index e1ca57a..6a144c9 100644
--- a/tools/certtools.py
+++ b/tools/certtools.py
@@ -11,6 +11,8 @@ import sys
 import hashlib
 import ecdsa
 import datetime
+import cStringIO
+import zipfile
 
 publickeys = {
     "https://ct.googleapis.com/pilot/":
@@ -44,11 +46,14 @@ def get_cert_info(s):
 
 
 def get_pemlike(filename, marker):
+    return get_pemlike_from_file(open(filename), marker)
+
+def get_pemlike_from_file(f, marker):
     entries = []
     entry = ""
     inentry = False
 
-    for line in open(filename):
+    for line in f:
         line = line.strip()
         if line == "-----BEGIN " + marker + "-----":
             entry = ""
@@ -63,6 +68,10 @@ def get_pemlike(filename, marker):
 def get_certs_from_file(certfile):
     return get_pemlike(certfile, "CERTIFICATE")
 
+def get_certs_from_string(s):
+    f = cStringIO.StringIO(s)
+    return get_pemlike_from_file(f, "CERTIFICATE")
+
 def get_eckey_from_file(keyfile):
     keys = get_pemlike(keyfile, "EC PRIVATE KEY")
     assert len(keys) == 1
@@ -138,6 +147,10 @@ def get_entries(baseurl, start, end):
         print "ERROR:", e.read()
         sys.exit(1)
 
+def extract_precertificate(precert_chain_entry):
+    (precert, certchain) = unpack_tls_array(precert_chain_entry, 3)
+    return (precert, certchain)
+
 def decode_certificate_chain(packed_certchain):
     (unpacked_certchain, rest) = unpack_tls_array(packed_certchain, 3)
     assert len(rest) == 0
@@ -235,8 +248,13 @@ def unpack_mtl(merkle_tree_leaf):
     leaf_type = merkle_tree_leaf[1:2]
     timestamped_entry = merkle_tree_leaf[2:]
     (timestamp, entry_type) = struct.unpack(">QH", timestamped_entry[0:10])
-    (leafcert, rest_entry) = unpack_tls_array(timestamped_entry[10:], 3)
-    return (leafcert, timestamp)
+    if entry_type == 0:
+        issuer_key_hash = None
+        (leafcert, rest_entry) = unpack_tls_array(timestamped_entry[10:], 3)
+    elif entry_type == 1:
+        issuer_key_hash = timestamped_entry[10:42]
+        (leafcert, rest_entry) = unpack_tls_array(timestamped_entry[42:], 3)
+    return (leafcert, timestamp, issuer_key_hash)
 
 def get_leaf_hash(merkle_tree_leaf):
     leaf_hash = hashlib.sha256()
@@ -284,3 +302,72 @@ def build_merkle_tree(layer0):
         current_layer = next_merkle_layer(current_layer)
         layers.append(current_layer)
     return layers
+
+def print_inclusion_proof(proof):
+    audit_path = proof[u'audit_path']
+    n = proof[u'leaf_index']
+    level = 0
+    for s in audit_path:
+        entry = base64.b16encode(base64.b64decode(s))
+        n ^= 1
+        print level, n, entry
+        n >>= 1
+        level += 1
+
+def get_one_cert(store, i):
+    filename = i / 10000
+    zf = zipfile.ZipFile("%s/%04d.zip" % (store, i / 10000))
+    cert = zf.read("%08d" % i)
+    zf.close()
+    return cert
+
+def get_hash_from_certfile(cert):
+    for line in cert.split("\n"):
+        if line.startswith("-----"):
+            return None
+        if line.startswith("Leafhash: "):
+            return base64.b16decode(line[len("Leafhash: "):])
+    return None
+
+def get_proof(store, tree_size, n):
+    hash = get_hash_from_certfile(get_one_cert(store, n))
+    return get_proof_by_hash(args.baseurl, hash, tree_size)
+
+def get_certs_from_zipfiles(zipfiles, firstleaf, lastleaf):
+    for i in range(firstleaf, lastleaf + 1):
+        try:
+            yield zipfiles[i / 10000].read("%08d" % i)
+        except KeyError:
+            return
+
+def get_merkle_hash_64k(store, blocknumber, write_to_cache=False):
+    hashfilename = "%s/%04x.64khash" % (store, blocknumber)
+    try:
+        hash = base64.b16decode(open(hashfilename).read())
+        assert len(hash) == 32
+        return ("hash", hash)
+    except IOError:
+        pass
+    firstleaf = blocknumber * 65536
+    lastleaf = firstleaf + 65535
+    firstfile = firstleaf / 10000
+    lastfile = lastleaf / 10000
+    zipfiles = {}
+    for i in range(firstfile, lastfile + 1):
+        try:
+            zipfiles[i] = zipfile.ZipFile("%s/%04d.zip" % (store, i))
+        except IOError:
+            break
+    certs = get_certs_from_zipfiles(zipfiles, firstleaf, lastleaf)
+    layer0 = [get_hash_from_certfile(cert) for cert in certs]
+    tree = build_merkle_tree(layer0)
+    calculated_hash = tree[-1][0]
+    for zf in zipfiles.values():
+        zf.close()
+    if len(layer0) != 65536:
+        return ("incomplete", (len(layer0), calculated_hash))
+    if write_to_cache:
+        f = open(hashfilename, "w")
+        f.write(base64.b16encode(calculated_hash))
+        f.close()
+    return ("hash", calculated_hash)
diff --git a/tools/fetchallcerts.py b/tools/fetchallcerts.py
old mode 100644
new mode 100755
index 2276e68..866bb43
--- a/tools/fetchallcerts.py
+++ b/tools/fetchallcerts.py
@@ -14,20 +14,25 @@ import struct
 import hashlib
 import itertools
 from certtools import *
+import zipfile
+import os
+import time
 
 parser = argparse.ArgumentParser(description='')
 parser.add_argument('baseurl', help="Base URL for CT server")
 parser.add_argument('--store', default=None, metavar="dir", help='Store certificates in directory dir')
-parser.add_argument('--start', default=0, metavar="n", type=int, help='Start at index n')
-parser.add_argument('--verify', action='store_true', help='Verify STH')
+parser.add_argument('--write-sth', action='store_true', help='Write STH')
 args = parser.parse_args()
 
 def extract_original_entry(entry):
     leaf_input =  base64.decodestring(entry["leaf_input"])
-    (leaf_cert, timestamp) = unpack_mtl(leaf_input)
+    (leaf_cert, timestamp, issuer_key_hash) = unpack_mtl(leaf_input)
     extra_data = base64.decodestring(entry["extra_data"])
+    if issuer_key_hash != None:
+        (precert, extra_data) = extract_precertificate(extra_data)
+        leaf_cert = precert
     certchain = decode_certificate_chain(extra_data)
-    return [leaf_cert] + certchain
+    return ([leaf_cert] + certchain, timestamp, issuer_key_hash)
 
 def get_entries_wrapper(baseurl, start, end):
     fetched_entries = 0
@@ -45,36 +50,129 @@ def print_layer(layer):
         print base64.b16encode(entry)
 
 sth = get_sth(args.baseurl)
+check_sth_signature(args.baseurl, sth)
 tree_size = sth["tree_size"]
 root_hash = base64.decodestring(sth["sha256_root_hash"])
 
+try:
+    if args.store:
+        oldsth = json.load(open(args.store + "/currentsth"))
+    else:
+        oldsth = None
+except IOError:
+    oldsth = None
+
+sth_timestamp = datetime.datetime.fromtimestamp(sth["timestamp"]/1000)
+since_timestamp = time.time() - sth["timestamp"]/1000
+
+print "Log last updated %s, %d seconds ago" % (sth_timestamp.ctime(), since_timestamp)
+
 print "tree size", tree_size
 print "root hash", base64.b16encode(root_hash)
 
-entries = get_entries_wrapper(args.baseurl, args.start, tree_size - 1)
+if oldsth:
+    if oldsth["tree_size"] == tree_size:
+        print "Tree size has not changed"
+        if oldsth["sha256_root_hash"] != sth["sha256_root_hash"]:
+            print "Root hash is different even though tree size is the same."
+            print "Log has violated the append-only property."
+            print "Old hash:", oldsth["sha256_root_hash"]
+            print "New hash:", sth["sha256_root_hash"]
+            sys.exit(1)
+        if oldsth["timestamp"] == sth["timestamp"]:
+            print "Timestamp has not changed"
+    else:
+        print "Tree size changed, old tree size was", oldsth["tree_size"]
 
-if args.verify:
+merkle_64klayer = []
+
+if args.store:
+    ncerts = None
+    for blocknumber in range(0, (tree_size / 65536) + 1):
+        (resulttype, result) = get_merkle_hash_64k(args.store, blocknumber, write_to_cache=True)
+        if resulttype == "incomplete":
+            (incompletelength, hash) = result
+            ncerts = blocknumber * 65536 + incompletelength
+            break
+        assert resulttype == "hash"
+        hash = result
+        merkle_64klayer.append(hash)
+        print blocknumber * 65536,
+        sys.stdout.flush()
+    print
+    print "ncerts", ncerts
+else:
+    ncerts = 0
+
+entries = get_entries_wrapper(args.baseurl, ncerts, tree_size - 1)
+
+if not args.store:
     layer0 = [get_leaf_hash(base64.decodestring(entry["leaf_input"])) for entry in entries]
 
     tree = build_merkle_tree(layer0)
 
     calculated_root_hash = tree[-1][0]
 
-    print "calculated root hash", base64.b16encode(calculated_root_hash)
-
-    if calculated_root_hash != root_hash:
-        print "fetched root hash and calculated root hash different, aborting"
-        sys.exit(1)
-
-elif args.store:
-    for entry, i in itertools.izip(entries, itertools.count(args.start)):
+else:
+    currentfilename = None
+    zf = None
+    for entry, i in itertools.izip(entries, itertools.count(ncerts)):
         try:
-            chain = extract_original_entry(entry)
-            f = open(args.store + "/" + ("%08d" % i), "w")
+            (chain, timestamp, issuer_key_hash) = extract_original_entry(entry)
+            zipfilename = args.store + "/" + ("%04d.zip" % (i / 10000))
+            if zipfilename != currentfilename:
+                if zf:
+                    zf.close()
+                zf = zipfile.ZipFile(zipfilename, "a",
+                                     compression=zipfile.ZIP_DEFLATED)
+                currentfilename = zipfilename
+            s = ""
+            s += "Timestamp: %s\n" % timestamp
+            leaf_input = base64.decodestring(entry["leaf_input"])
+            leaf_hash = get_leaf_hash(leaf_input)
+            s += "Leafhash: %s\n" % base64.b16encode(leaf_hash)
+            if issuer_key_hash:
+                s += "-----BEGIN PRECERTIFICATE-----\n"
+                s += base64.encodestring(chain[0]).rstrip() + "\n"
+                s += "-----END PRECERTIFICATE-----\n"
+                s += "\n"
+                chain = chain[1:]
             for cert in chain:
-                print >> f, "-----BEGIN CERTIFICATE-----"
-                print >> f, base64.encodestring(cert).rstrip()
-                print >> f, "-----END CERTIFICATE-----"
-                print >> f, ""
-        except AssertionError:
-            print "error for cert", i
+                s += "-----BEGIN CERTIFICATE-----\n"
+                s += base64.encodestring(cert).rstrip() + "\n"
+                s += "-----END CERTIFICATE-----\n"
+                s += "\n"
+            zf.writestr("%08d" % i, s)
+        except AssertionError, e:
+            print "error for cert", i, e
+    if zf:
+        zf.close()
+
+    for blocknumber in range(ncerts / 65536, (tree_size / 65536) + 1):
+        (resulttype, result) = get_merkle_hash_64k(args.store, blocknumber, write_to_cache=True)
+        if resulttype == "incomplete":
+            (incompletelength, hash) = result
+            ncerts = blocknumber * 65536 + incompletelength
+            merkle_64klayer.append(hash)
+            break
+        assert resulttype == "hash"
+        hash = result
+        merkle_64klayer.append(hash)
+        print blocknumber * 65536, base64.b16encode(hash)
+
+    tree = build_merkle_tree(merkle_64klayer)
+
+    calculated_root_hash = tree[-1][0]
+
+    assert ncerts == tree_size
+
+print "calculated root hash", base64.b16encode(calculated_root_hash)
+
+if calculated_root_hash != root_hash:
+    print "fetched root hash and calculated root hash different"
+    sys.exit(1)
+
+if args.store and args.write_sth:
+    f = open(args.store + "/currentsth", "w")
+    f.write(json.dumps(sth))
+    f.close()
diff --git a/tools/submitcert.py b/tools/submitcert.py
index 1b87b53..04b6ebe 100755
--- a/tools/submitcert.py
+++ b/tools/submitcert.py
@@ -15,6 +15,7 @@ from certtools import *
 import os
 import signal
 import select
+import zipfile
 
 from multiprocessing import Pool
 
@@ -29,13 +30,13 @@ if certfilepath[-1] == "/":
 else:
     certfiles = [certfilepath]
 
-def submitcert(certfile):
+def submitcert((certfile, cert)):
     timing = timing_point()
-    certs = get_certs_from_file(certfile)
+    certchain = get_certs_from_string(cert)
     timing_point(timing, "readcerts")
 
     try:
-        result = add_chain(baseurl, {"chain":map(base64.b64encode, certs)})
+        result = add_chain(baseurl, {"chain":map(base64.b64encode, certchain)})
     except SystemExit:
         print "EXIT:", certfile
         select.select([], [], [], 1.0)
@@ -49,7 +50,7 @@ def submitcert(certfile):
 
     try:
         if check_sig:
-            check_sct_signature(baseurl, certs[0], result)
+            check_sct_signature(baseurl, certchain[0], result)
             timing_point(timing, "checksig")
     except AssertionError, e:
         print "ERROR:", certfile, e
@@ -63,7 +64,7 @@ def submitcert(certfile):
 
     if lookup_in_log:
 
-        merkle_tree_leaf = pack_mtl(result["timestamp"], certs[0])
+        merkle_tree_leaf = pack_mtl(result["timestamp"], certchain[0])
 
         leaf_hash = get_leaf_hash(merkle_tree_leaf)
 
@@ -84,14 +85,14 @@ def submitcert(certfile):
 
         certchain = decode_certificate_chain(base64.decodestring(extra_data))
 
-        submittedcertchain = certs[1:]
+        submittedcertchain = certchain[1:]
 
         for (submittedcert, fetchedcert, i) in zip(submittedcertchain,
                                                    certchain, itertools.count(1)):
             print "cert", i, "in chain is the same:", submittedcert == fetchedcert
 
         if len(certchain) == len(submittedcertchain) + 1:
-            last_issuer = get_cert_info(certs[-1])["issuer"]
+            last_issuer = get_cert_info(certchain[-1])["issuer"]
             root_subject = get_cert_info(certchain[-1])["subject"]
             print "issuer of last cert in submitted chain and " \
                 "subject of last cert in fetched chain is the same:", \
@@ -105,20 +106,46 @@ def submitcert(certfile):
     timing_point(timing, "lookup")
     return timing["deltatimes"]
 
+def get_ncerts(certfiles):
+    n = 0
+    for certfile in certfiles:
+        if certfile.endswith(".zip"):
+            zf = zipfile.ZipFile(certfile)
+            n += len(zf.namelist())
+            zf.close()
+        else:
+            n += 1
+    return n
+
+def get_all_certificates(certfiles):
+    for certfile in certfiles:
+        if certfile.endswith(".zip"):
+            zf = zipfile.ZipFile(certfile)
+            for name in zf.namelist():
+                yield (name, zf.read(name))
+            zf.close()
+        else:
+            yield (certfile, open(certfile).read())
+
 p = Pool(16, lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
 
 nsubmitted = 0
 lastprinted = 0
-starttime = datetime.datetime.now()
 
-print len(certfiles), "certs"
+ncerts = get_ncerts(certfiles)
 
-submitcert(certfiles[0])
+print ncerts, "certs"
+
+certs = get_all_certificates(certfiles)
+
+submitcert(certs.next())
 nsubmitted += 1
 select.select([], [], [], 3.0)
 
+starttime = datetime.datetime.now()
+
 try:
-    for timing in p.imap_unordered(submitcert, certfiles[1:]):
+    for timing in p.imap_unordered(submitcert, certs):
         if timing == None:
             print "error"
             print "submitted", nsubmitted
@@ -129,7 +156,7 @@ try:
         deltatime = datetime.datetime.now() - starttime
         deltatime_f = deltatime.seconds + deltatime.microseconds / 1000000.0
         rate = nsubmitted / deltatime_f
-        if nsubmitted > lastprinted + len(certfiles) / 10:
+        if nsubmitted > lastprinted + ncerts / 10:
             print nsubmitted, "rate %.1f" % rate
             lastprinted = nsubmitted
         #print timing, "rate %.1f" % rate
-- 
cgit v1.1