GIStemp STEP1_comb_pieces

The Python program comb_pieces.py

Here is the listing. The explanation will follow some months or weeks from now below, after the === bar.

   
 
#! /usr/bin/python

import bsddb, stationstring, monthlydata, string, sys, listStats
import comb_records

BAD = '???'
MIN_MID_YEARS = 5            # MIN_MID_YEARS years closest to ymid
BUCKET_RADIUS = 10
BIG_X = 1.                     # "we may then use a fraction X of ..."
GET_ALL = 1
VERBOSE = 0

HELENA_IN_NAME = 'combine_pieces_helena.in'

def get_longest_overlap(new_sums, new_wgts, new_data, begin, years, records):
    end = begin + years - 1
    comb_records.average(new_sums, new_wgts, new_data, years)
    mon = monthlydata.new(new_data, BAD)
    ann_mean, ann_anoms = mon.annual()
    overlap = 0
    length = 0
    for rec_id, record in records.items():
        rec_ann_anoms = record['ann_anoms']
        rec_years = record['years']
        rec_begin = record['dict']['begin']
        sum = wgt = 0
        for n in range(rec_years):
            rec_anom = rec_ann_anoms[n]
            if abs(rec_anom - BAD) < 0.1:
                continue
            year = n + rec_begin
            anom = ann_anoms[year - begin]
            if abs(anom - BAD) < 0.1:
                continue
            wgt = wgt + 1
            sum = sum + rec_anom - anom
        if wgt < overlap:
            continue
        overlap = wgt
        # diff = sum / wgt
        best_id = rec_id
        best_record = record
    ignore_me = "ignore_me"
    return best_record, best_id, ignore_me

def get_actual_endpoints(new_wgts, begin, years):
    tmp_begin = 9999
    tmp_end = 0
    for m in range(12):
        wgts_row = new_wgts[m]
        for n in range(years):
            if wgts_row[n] == 0:
                continue
            year = n + begin
            if year < tmp_begin:
                tmp_begin = year
            if tmp_end < year:
                tmp_end = year
    return tmp_begin, tmp_end

def find_quintuples(new_sums, new_wgts, new_data,
                    begin, years, record, rec_begin,
                    new_id=None, rec_id=None):
    end = begin + years - 1

    rec_begin = record['dict']['begin']
    rec_end = rec_begin + record['years'] - 1

    actual_begin, actual_end = get_actual_endpoints(new_wgts, begin, years)

    max_begin = max(actual_begin, rec_begin)
    min_end = min(actual_end, rec_end)
    middle_year = int(.5 * (max_begin + min_end) + 0.5)
    print "max begin: %s\tmin end: %s" % (max_begin, min_end)

    comb_records.average(new_sums, new_wgts, new_data, years)
    mon = monthlydata.new(new_data, BAD)
    ann_mean, sublist1 = mon.annual()
    ignore_me, ann_std_dev = listStats.meanAndSigma(sublist1, None, None, BAD)
    print "ann_std_dev = %s" % ann_std_dev
##      print sublist1, len(sublist1)
    offset1 = (middle_year - begin)
    len1 = len(sublist1)

    sublist2 = record['ann_anoms']
    rec_ann_mean = record['ann_mean']
##      print sublist2, len(sublist2)
    offset2 = (middle_year - rec_begin)
    len2 = len(sublist2)

    ov_success = 0
    okay_flag = 0
##      for year in range(begin, end + 1):
##          anom1 = sublist1[year - begin]
##          anom2 = BAD
##          if year >= rec_begin and year <= rec_end:
##              anom2 = sublist2[year - rec_begin]
##          print "%d %7.2f %7.2f" % (year, anom1, anom2)
    for rad in range(1, BUCKET_RADIUS + 1):
        if rad > 1:
            if VERBOSE:
                print "incrementing bucket radius: %s -> %s" % (rad - 1, rad)
        count1 = sum1 = 0
        count2 = sum2 = 0
        for i in range(0, rad + 1):
            for sign in [-1, 1]:
                if sign == 1 and i == 0:
                    continue
                index1 = i * sign + offset1
                index2 = i * sign + offset2
                #print "*\t%s\t%s*" % (index1, index2)
                #print "*\t%s\t%s\t%s*" % (middle_year, begin, rec_begin)
                if index1 < 0 or index1 >= len1:
                    anom1 = BAD
                else:
                    anom1 = sublist1[index1]
                if index2 < 0 or index2 >= len2:
                    anom2 = BAD
                else:
                    anom2 = sublist2[index2]
                if VERBOSE:
                    rec_mean = anom2 + rec_ann_mean
                    mean = anom1 + ann_mean
                    print "*%s\t%s\t%s\t%s\t%s*" % \
                          (middle_year, index1 + begin,
                           index2 + rec_begin,
                           mean,
                           rec_mean)
                if anom1 != BAD and (GET_ALL or count1 < MIN_MID_YEARS):
                    sum1 = sum1 + anom1 + ann_mean
                    count1 = count1 + 1
                if anom2 != BAD and (GET_ALL or count2 < MIN_MID_YEARS):
                    sum2 = sum2 + anom2 + rec_ann_mean
                    count2 = count2 + 1
        if count1 >= MIN_MID_YEARS and count2 >= MIN_MID_YEARS:
            # okay_flag = 1
            print "overlap success: %s %s" % (new_id, rec_id)
            ov_success = 1
            avg1 = sum1 / float(count1)
            avg2 = sum2 / float(count2)
            diff = abs(avg1 - avg2)
            print "diff = %s" % diff
            if diff < BIG_X * ann_std_dev:
                okay_flag = 1
                print "combination success: %s %s" % (new_id, rec_id)
            else:
                print "combination failure: %s %s" % (new_id, rec_id)
            break
    if not ov_success:
        print "overlap failure: %s %s" % (new_id, rec_id)
    print "counts: %s" % ((count1, count2),)
    return okay_flag

def combine(*args, **kwargs):
    new_sums, new_wgts, new_data, begin, years, records = args
    new_id = kwargs['new_id']
    end = begin + years - 1

    record, rec_id, ignore_me = apply(get_longest_overlap, args)
    rec_begin = record['dict']['begin']
    rec_end = rec_begin + record['years'] - 1

    print "\t", rec_id, rec_begin, record['years'] + rec_begin - 1

    is_okay = find_quintuples(new_sums, new_wgts, new_data,
                              begin, years, record, rec_begin,
                              new_id=new_id, rec_id=rec_id)

    if not is_okay:
        print "\t***no other pieces okay***"
        return 0
    else:
        del records[rec_id]
        comb_records.add(new_sums, new_wgts, 0.0, begin, record)
        print "\t", rec_id, rec_begin, record['years'] + rec_begin - 1
        return 1

def get_longest(records):
    longest = 0
    for rec_id, record in records.items():
        length = record['length']
        if length > longest:
            longest = length
            longest_rec = record
            longest_id = rec_id
    return longest_rec, longest_id

def fill_new_db(ids, rec_id_dict, old_db, new_db, helena_ds):
    new_ids = []
    for id in ids:
##          if id != '14761901001':
##              continue
        rec_ids = rec_id_dict[id]
        print id
        # if id > '15': break
        while 1:
            # print rec_ids
            records, begin, end = comb_records.get_records(old_db, rec_ids)

            if len(records) == 1:
                rec_id, record = records.items()[0]
                new_db[rec_id] = record['string']
                new_ids.append(rec_id)
                break

            if helena_ds.has_key(id):
                id1, id2, this_year, month, summand = helena_ds[id]
                dict = records[id1]['dict']
                data = records[id1]['data']
                begin = dict['begin']
                years = records[id1]['years']
                for n in range(years):
                    year = n + begin
                    if year > this_year:
                        break
                    for m in range(12):
                        if year == this_year and m > month:
                            break
                        datum = data[m][n]
                        if datum == BAD:
                            continue
                        data[m][n] = datum + summand
                del helena_ds[id]
                mon = monthlydata.new(data, BAD)
                ann_mean, ann_anoms = mon.annual()
                records[id1]['ann_anoms'] = ann_anoms
                records[id1]['ann_mean'] = ann_mean

            record, rec_id = get_longest(records)
            years = end - begin + 1
            rec_dict = record['dict']
            source = rec_dict['source']
            del records[rec_id]
            new_sums, new_wgts, new_data = comb_records.get_new_data(
                record, begin, years)

##              long_ann_anoms = record['ann_anoms']
##              long_mean, long_sigma = listStats.meanAndSigma(long_ann_anoms,
##                                                             None, None, BAD)

            new_dict = rec_dict.copy()
            new_dict['begin'] = begin
            print "\t%s %s %s -- %s" % (rec_id, begin, begin + years -1,source)
            ok_flag = 1
            while ok_flag and records.keys():
                ok_flag = combine(new_sums, new_wgts, new_data,
                                  begin, years, records, new_id=rec_id)
            begin = comb_records.final_average(new_sums, new_wgts,
                                                  new_data, years, begin)
            new_dict['begin'] = begin
            s = stationstring.serialize(new_dict, new_data)
            new_db[rec_id] = s
            new_ids.append(rec_id)
            rec_ids = records.keys()
            if not rec_ids:
                break
    new_db['IDS'] = string.join(new_ids, ' ')

def main():
    stemname = sys.argv[1]
    old_name = stemname + '.bdb'
    new_name = stemname + '.pieces.bdb'
    print "opening db file '%s'" % old_name
    old_db = bsddb.hashopen(old_name, 'r')
    print "creating db file '%s'" % new_name
    new_db = bsddb.hashopen(new_name, 'n')
    global BAD
    BAD = string.atoi(old_db["IBAD"]) / 10.0
    BAD = 0.1 * int(BAD * 10.0)
    comb_records.BAD = BAD
    new_db['IBAD'] = old_db['IBAD']
    ids, rec_id_dict = comb_records.get_ids(old_db)

    try:
        helena_in = open(HELENA_IN_NAME, 'r')
        helena_ls = helena_in.readlines()
        helena_in.close()
        print "successfully read '%s'" % HELENA_IN_NAME
    except IOError:
        helena_ls = []
    helena_ds = {}
    for x in helena_ls:
        id1, id2, year, month, summand = string.split(x)
        year = string.atoi(year)
        month = string.atoi(month)
        summand = string.atof(summand)
        id = id1[:-1]
        helena_ds[id] = id1, id2, year, month, summand

    fill_new_db(ids, rec_id_dict, old_db, new_db, helena_ds)

if __name__ == '__main__':
    main()


=========================================================

The analysis of this program will have to wait a fair while as I’m still finishing another step. It looks like it’s a well done library of utilities so I’ll likely leave it to the end.

About E.M.Smith

A technical managerial sort interested in things from Stonehenge to computer science. My present "hot buttons' are the mythology of Climate Change and ancient metrology; but things change...
This entry was posted in GISStemp Technical and Source Code and tagged , , , , , . Bookmark the permalink.