From 49b6e85963ef55fb6cfa1876fe825730f95658bc Mon Sep 17 00:00:00 2001
From: Linus Nordberg
Date: Fri, 25 Nov 2016 00:47:13 +0100
Subject: Parallelise merge_fetch.py.

NOTE: Not supporting permdb yet!

We're still not passing the tests because merge_backup.py exits when
the secondary merge disappears.
---
Review notes (added in review, not part of the commit message):
- Line structure restored from a whitespace-collapsed rendering; hunk
  headers and diffstat numbers below are updated for the amended '+'
  lines, the 'index' line still names the original blobs.
- merge_fetch_parallel: restarting a dead worker indexed the list
  `storagenodes' by name (TypeError); now uses a name -> node dict.
- merge_fetch_parallel: `fetch[ehash] = nodes.append(node)' stored None;
  the node list is now rotated in place and holds node names, so that a
  restarted worker is reached through its new pipe.
- merge_fetch_parallel: an entry FETCHED by more than one worker was
  appended to 'logorder' more than once; now added only once.
- NOTE(review), not changed: main() hexencodes the hash returned by
  merge_fetch_sequenced(), so get_logorder() presumably returns binary
  hashes, while merge_fetch_parallel() stores hex strings in `logorder'
  and writes them to 'fetched' -- confirm against mergetools.

 tools/merge_fetch.py | 238 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 224 insertions(+), 14 deletions(-)

(limited to 'tools/merge_fetch.py')

diff --git a/tools/merge_fetch.py b/tools/merge_fetch.py
index 7973fae..6accca4 100755
--- a/tools/merge_fetch.py
+++ b/tools/merge_fetch.py
@@ -10,14 +10,18 @@
 import sys
 import struct
 import subprocess
+import signal
 import logging
 from time import sleep
+from multiprocessing import Process, Pipe
+from random import Random
 from mergetools import get_logorder, verify_entry, get_new_entries, \
     chunks, fsync_logorder, get_entries, add_to_logorder, \
-    hexencode, parse_args, perm, flock_ex_or_fail, Status
+    hexencode, hexdecode, parse_args, perm, flock_ex_or_fail, Status, \
+    terminate_child_procs
 from certtools import timing_point, write_file, create_ssl_context

-def merge_fetch(args, config, localconfig):
+def merge_fetch_sequenced(args, config, localconfig):
     paths = localconfig["paths"]
     storagenodes = config["storagenodes"]
     mergedb = paths["mergedb"]
@@ -96,14 +100,223 @@ def merge_fetch(args, config, localconfig):
     else:
         return (tree_size, logorder[tree_size-1])

+def merge_fetch_worker(args, localconfig, storagenode, pipe):
+    """
+    Worker process serving one storage node.
+
+    Loops forever. Each round waits up to `timeout' seconds for one
+    "FETCH <hexhash>" command on `pipe', then fetches and verifies all
+    entries requested so far, adds them to the 'chains' database and
+    answers "FETCHED <hexhash>" for each of them. Finally asks the
+    storage node for its new entries and reports each one to the parent
+    as "NEWENTRY <hexhash>".
+    """
+    paths = localconfig["paths"]
+    mergedb = paths["mergedb"]
+    chainsdb = perm(localconfig.get("dbbackend", "filedb"), mergedb + "/chains")
+    own_key = (localconfig["nodename"],
+               "%s/%s-private.pem" % (paths["privatekeys"],
+                                      localconfig["nodename"]))
+    to_fetch = set()
+    timeout = max(3, args.mergeinterval / 10)
+    while True:
+        if pipe.poll(timeout):
+            msg = pipe.recv().split()
+            if len(msg) < 2:
+                continue
+            cmd = msg[0]
+            ehash = msg[1]
+            if cmd == 'FETCH':
+                to_fetch.add(hexdecode(ehash))
+            else:
+                logging.warning("%s: unknown command from parent: %s",
+                                storagenode["name"], msg)
+
+        if len(to_fetch) > 0:
+            logging.info("%s: fetching %d entries", storagenode["name"],
+                         len(to_fetch))
+            # TODO: Consider running the verifycert process longer.
+            verifycert = subprocess.Popen(
+                [paths["verifycert_bin"], paths["known_roots"]],
+                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+            # Chunking for letting other workers take the chainsdb lock.
+            for chunk in chunks(list(to_fetch), 100):
+                chainsdb.lock_ex()
+                entries = get_entries(storagenode["name"],
+                                      "https://%s/" % storagenode["address"],
+                                      own_key, paths, chunk)
+                for ehash in chunk:
+                    entry = entries[ehash]
+                    verify_entry(verifycert, entry, ehash)
+                    chainsdb.add(ehash, entry)
+                chainsdb.commit()
+                for ehash in chunk:
+                    pipe.send('FETCHED %s' % hexencode(ehash))
+                    to_fetch.remove(ehash)
+            verifycert.communicate(struct.pack("I", 0))
+
+        new_entries = get_new_entries(storagenode["name"],
+                                      "https://%s/" % storagenode["address"],
+                                      own_key, paths)
+        if len(new_entries) > 0:
+            logging.info("%s: got %d new entries", storagenode["name"],
+                         len(new_entries))
+            for ehash in new_entries:
+                pipe.send('NEWENTRY %s' % hexencode(ehash))
+
+def term(signal, arg):
+    """SIGTERM handler: terminate child processes, then exit with 1."""
+    terminate_child_procs()
+    sys.exit(1)
+
+def newworker(name, args):
+    """
+    Start a daemonised merge_fetch_worker process named after `name'.
+
+    `args' is the list of worker arguments; the child end of a new Pipe
+    is appended to it. Returns (name, parent end of the pipe, process).
+    """
+    my_conn, child_conn = Pipe()
+    p = Process(target=merge_fetch_worker,
+                args=tuple(args + [child_conn]),
+                name='merge_fetch_%s' % name)
+    p.daemon = True
+    p.start()
+    logging.debug("%s started, pid %d", name, p.pid)
+    return (name, my_conn, p)
+
+def merge_fetch_parallel(args, config, localconfig):
+    """
+    Run one worker process per storage node and coordinate them.
+
+    As long as there are workers: poll each worker for one message,
+    append entries reported as FETCHED to 'logorder', ask workers to
+    FETCH entries reported as NEWENTRY which are not in the log, and
+    rewrite 'fetched' with the index and hash of the last log entry.
+    """
+    paths = localconfig["paths"]
+    storagenodes = config["storagenodes"]
+    mergedb = paths["mergedb"]
+    logorderfile = mergedb + "/logorder"
+    currentsizefile = mergedb + "/fetched"
+
+    rand = Random()
+    signal.signal(signal.SIGTERM, term)
+
+    # `storagenodes' is a list; keep a name -> node mapping for restarts.
+    nodes_by_name = dict((sn['name'], sn) for sn in storagenodes)
+    procs = {}
+    for storagenode in storagenodes:
+        name = storagenode['name']
+        procs[name] = newworker(name, [args, localconfig, storagenode])
+
+    logorder = get_logorder(logorderfile)     # List of entries in log.
+    entries_in_log = set(logorder)            # Set of entries in log.
+    entries_to_fetch = set()                  # Set of entries to fetch.
+    fetch = {}                                # Dict with entries to fetch.
+    while procs:
+        assert(not entries_to_fetch)
+        # Poll worker processes. Iterate over a copy since dead workers
+        # are replaced in `procs' while looping.
+        for name, pipe, p in list(procs.values()):
+            if not p.is_alive():
+                logging.warning("%s is gone, restarting", name)
+                procs[name] = newworker(name, [args, localconfig,
+                                               nodes_by_name[name]])
+                continue
+            logging.info("polling %s", name)
+            if pipe.poll(1):
+                msg = pipe.recv().split()
+                if len(msg) < 2:
+                    logging.warning("unknown command from %s: %s", name, msg)
+                    continue
+                cmd = msg[0]
+                ehash = msg[1]
+                if cmd == 'NEWENTRY':
+                    logging.info("NEWENTRY at %s: %s", name, ehash)
+                    entries_to_fetch.add(ehash)
+                    logging.debug("entries_to_fetch: %s", entries_to_fetch)
+                elif cmd == 'FETCHED':
+                    logging.info("FETCHED from %s: %s", name, ehash)
+                    # The same entry may be fetched by more than one
+                    # worker; add it to the log only once.
+                    if ehash not in entries_in_log:
+                        logorder.append(ehash)
+                        add_to_logorder(logorderfile, hexdecode(ehash))
+                        fsync_logorder(logorderfile)
+                        entries_in_log.add(ehash)
+                    entries_to_fetch.discard(ehash)
+                    fetch.pop(ehash, None)
+                else:
+                    logging.warning("unknown command from %s: %s", name, msg)
+
+        # Ask workers to fetch entries.
+        logging.debug("nof entries to fetch including entries in log: %d",
+                      len(entries_to_fetch))
+        entries_to_fetch -= entries_in_log
+        logging.info("entries to fetch: %d", len(entries_to_fetch))
+        # Add entries in entries_to_fetch as keys in dictionary fetch,
+        # values being a list of storage node names, in randomised order.
+        for e in entries_to_fetch:
+            if e not in fetch:
+                names = list(procs.keys())
+                rand.shuffle(names)
+                fetch[e] = names
+        # For each entry to fetch, treat its list of nodes as a
+        # circular list and ask the one in the front to fetch the
+        # entry.
+        while entries_to_fetch:
+            ehash = entries_to_fetch.pop()
+            nodes = fetch[ehash]
+            node = nodes.pop(0)
+            # Rotate in place; list.append() returns None.
+            nodes.append(node)
+            name, pipe, p = procs[node]
+            logging.info("asking %s to FETCH %s", name, ehash)
+            pipe.send("FETCH %s" % ehash)
+
+        # Update the 'fetched' file.
+        logsize = len(logorder)
+        if logsize == 0:
+            last_hash = ''
+        else:
+            last_hash = logorder[logsize - 1]
+        logging.info("updating 'fetched' file: %d %s", logsize-1, last_hash)
+        currentsize = {"index": logsize - 1, "hash": last_hash}
+        logging.debug("writing to %s: %s", currentsizefile, currentsize)
+        write_file(currentsizefile, currentsize)
+
+    return 0
+
 def main():
     """
-    Fetch new entries from all storage nodes, in sequence.
+    If no `--mergeinterval':
+      Fetch new entries from all storage nodes, in sequence, updating
+      the 'logorder' file and the 'chains' database.

-    Indicate the current position by writing the hash and its 'logorder'
-    index, 0-based, to 'fetched'.
+      Write 'fetched' to reflect how far in 'logorder' we've successfully
+      fetched and verified.

-    Sleep some and start over, or exit if there's no `--mergeinterval'.
+    If `--mergeinterval':
+      Start one process per storage node, read from their pipes for
+      learning about two things: (i) new entries ready for fetching
+      ("NEWENTRY") and (ii) new entries being successfully fetched
+      ("FETCHED").
+
+      Write to their pipes ("FETCH") when they should fetch another entry.
+      Update 'logorder' as we see new FETCHED messages; the workers add
+      the fetched entries to the 'chains' database.
+
+      Write 'fetched' to reflect how far in 'logorder' we've successfully
+      fetched and verified.
+
+      Keep doing this forever.
+
+    NOTE: The point of having 'fetched' is that it can be atomically
+    written while 'logorder' cannot (unless we're fine with rewriting it
+    for each and every update, which we're not).
+
+    TODO: Deduplicate some code.
     """
     args, config, localconfig = parse_args()
     paths = localconfig["paths"]
@@ -124,17 +337,14 @@ def main():

     create_ssl_context(cafile=paths["https_cacertfile"])

-    while True:
-        logsize, last_hash = merge_fetch(args, config, localconfig)
+    if args.mergeinterval:
+        return merge_fetch_parallel(args, config, localconfig)
+    else:
+        logsize, last_hash = merge_fetch_sequenced(args, config, localconfig)
         currentsize = {"index": logsize - 1, "hash": hexencode(last_hash)}
         logging.debug("writing to %s: %s", currentsizefile, currentsize)
         write_file(currentsizefile, currentsize)
-        if args.mergeinterval is None:
-            break
-        logging.debug("sleeping %d seconds", args.mergeinterval / 10)
-        sleep(args.mergeinterval / 10)
-
-    return 0
+        return 0

 if __name__ == '__main__':
     sys.exit(main())
-- 
cgit v1.1