GIStemp STEP1_comb_records

The Python program comb_records.py

Here is the listing. The explanation will follow some months or weeks from now below, after the === bar.

   
 
#! /usr/bin/python

import bsddb, stationstring, monthlydata, string, sys

BAD = '???'
MIN_OVERLAP = 4

def average(new_sums, new_wgts, new_data, years):
    for m in range(12):
        sums_row = new_sums[m]
        wgts_row = new_wgts[m]
        data_row = new_data[m]
        for n in range(years):
            wgt = wgts_row[n]
            if wgt == 0:
                assert data_row[n] == BAD
                continue
            data_row[n] = sums_row[n] / wgt

def final_average(new_sums, new_wgts, new_data, years, begin):
    min, max = 9999, -9999
    for m in range(12):
        mon_min, mon_max = 9999, -9999
        wgts_row = new_wgts[m]
        for n in range(years):
            wgt = wgts_row[n]
            if wgt == 0:
                continue
            if mon_min > n:
                mon_min = n
            if mon_max < n:
                mon_max = n
        if min >  mon_min:
            min = mon_min
        if max < mon_max:
            max = mon_max
    if min == 0 and max == years - 1:
        average(new_sums, new_wgts, new_data, years)
        return begin
    years = max - min + 1
    begin = begin + min
    end = begin + years - 1
    for m in range(12):
        new_sums[m] = new_sums[m][min: max + 1]
        new_wgts[m] = new_wgts[m][min: max + 1]
        new_data[m] = [BAD] * years
    average(new_sums, new_wgts, new_data, years)
    return begin

def add(new_sums, new_wgts, diff, begin, record):
    rec_begin = record['dict']['begin']
    rec_years = record['years']
    rec_data = record['data']
    for m in range(12):
        sums_row = new_sums[m]
        wgts_row = new_wgts[m]
        data_row = rec_data[m]
        for n in range(rec_years):
            datum = data_row[n]
            if abs(datum - BAD) < 0.1:
                continue
            year = n + rec_begin
            index = year - begin
            sums_row[index] = sums_row[index] + datum - diff
            wgts_row[index] = wgts_row[index] + 1

def get_longest_overlap(new_sums, new_wgts, new_data, begin, years, records):
    end = begin + years - 1
    average(new_sums, new_wgts, new_data, years)
    mon = monthlydata.new(new_data, BAD)
    ann_mean, ann_anoms = mon.annual()
    overlap = 0
    length = 0
    for rec_id, record in records.items():
        rec_ann_anoms = record['ann_anoms']
        rec_ann_mean = record['ann_mean']
        rec_years = record['years']
        rec_begin = record['dict']['begin']
        sum = wgt = 0
        for n in range(rec_years):
            rec_anom = rec_ann_anoms[n]
            if abs(rec_anom - BAD) < 0.1:
                continue
            year = n + rec_begin
            anom = ann_anoms[year - begin]
            if abs(anom - BAD) < 0.1:
                continue
            wgt = wgt + 1
            sum = sum + (rec_ann_mean + rec_anom) - (ann_mean + anom)
        if wgt < MIN_OVERLAP:
            continue
        if wgt < overlap:
            continue
        overlap = wgt
        diff = sum / wgt
        best_id = rec_id
        best_record = record
    if overlap < MIN_OVERLAP:
        return 0, 0, BAD
    return best_record, best_id, diff

def combine(*args):
    new_sums, new_wgts, new_data, begin, years, records = args
    record, rec_id, diff = apply(get_longest_overlap, args)
    if abs(diff - BAD) < 0.1:
        print "\tno other records okay"
        return 0
    del records[rec_id]
    add(new_sums, new_wgts, diff, begin, record)
    rec_begin = record['dict']['begin']
    print "\t", rec_id, rec_begin, record['years'] + rec_begin - 1, diff
    return 1

def get_best(records):
    ranks = {'MCDW': 4, 'USHCN': 3, 'SUMOFDAY': 2, 'UNKNOWN': 1}
    best = 1
    rids = records.keys()
    rids.sort()
    for rec_id in rids:
        record = records[rec_id]
        source = record['dict']['source']
        rank = ranks
        if rank >  best:
            best = rank
            best_rec = record
            best_id = rec_id
    if best > 1:
        return best_rec, best_id
    longest = 0
    for rec_id in rids:
        record = records[rec_id]
        length = record['length']
        if length > longest:
            longest = length
            longest_rec = record
            longest_id = rec_id
    return longest_rec, longest_id

def get_records(old_db, rec_ids):
    res = {}
    min, max = 9999, -9999
    for rec_id in rec_ids:
        s = old_db[rec_id]
        st = stationstring.new(s)
        dict = st.dict()
        data, years = st.data_years()
        assert years == len(data[0])
        begin = dict['begin']
        end = begin + years - 1
        if min > begin:
            min = begin
        if max < end:
            max = end
        mon = monthlydata.new(data, BAD)
        ann_mean, ann_anoms = mon.annual()
        length = 0
        for anom in ann_anoms:
            if anom != BAD:
                length = length + 1
        res[rec_id] = {'dict': dict, 'data': data,
                       'string': s, 'years': years,
                       'length': length, 'ann_anoms': ann_anoms,
                       'ann_mean': ann_mean}
    return res, min, max

def get_new_data(record, begin, years):
    sums, wgts, data = [None] * 12, [None] * 12, [None] *12
    rec_data = record['data']
    rec_begin, rec_years = record['dict']['begin'], record['years']
    rec_end = rec_begin + rec_years - 1
    for m in range(12):
        sums_row, wgts_row, data[m] = [0.0] * years, [0] * years, [BAD] * years
        rec_row = rec_data[m]
        for n in range(rec_years):
            datum = rec_row[n]
            # if datum == BAD:
            if abs(datum - BAD)  < 0.1:
                continue
            # assert abs(datum - BAD) > 0.1, "%.20f %.20f" % (datum, BAD)
            year = n + rec_begin
            index = year - begin
            sums_row[index] = datum
            wgts_row[index] = 1
        sums[m] = sums_row
        wgts[m] = wgts_row
    return sums, wgts, data

def fill_new_db(ids, rec_id_dict, old_db, new_db):
    new_ids = []
    for id in ids:
        rec_ids = rec_id_dict[id]
        print id
        while 1:
            records, begin, end = get_records(old_db, rec_ids)
            if len(records) == 1:
                rec_id, record = records.items()[0]
                new_db[rec_id] = record['string']
                new_ids.append(rec_id)
                break
            years = end - begin + 1
            record, rec_id = get_best(records)
            rec_dict = record['dict']
            source = rec_dict['source']
            del records[rec_id]
            new_sums, new_wgts, new_data = get_new_data(record, begin, years)
            new_dict = rec_dict.copy()
            new_dict['begin'] = begin
            print "\t%s %s %s -- %s" % (rec_id, begin, begin + years -1,source)
            ok_flag = 1
            while ok_flag and records.keys():
                ok_flag = combine(new_sums, new_wgts, new_data,
                                  begin, years, records)
            begin = final_average(new_sums, new_wgts, new_data, years, begin)
            new_dict['begin'] = begin
            s = stationstring.serialize(new_dict, new_data)
            new_db[rec_id] = s
            new_ids.append(rec_id)
            rec_ids = records.keys()
            if not rec_ids:
                break
    new_db['IDS'] = string.join(new_ids, ' ')

def get_ids(old_db):
    rec_ids = string.split(old_db['IDS'])
    rec_ids.sort()
    rec_id_dict = {}
    for rec_id in rec_ids:
        id = rec_id[:-1]
        try:
            rec_id_dict[id].append(rec_id)
        except KeyError:
            rec_id_dict[id] = [rec_id]
    ids = rec_id_dict.keys()
    ids.sort()
    return ids, rec_id_dict

def main():
    stemname = sys.argv[1]
    old_bdb_name = stemname + '.bdb'
    new_bdb_name = stemname + '.combined.bdb'
    print "opening db file '%s'" % old_bdb_name
    old_db = bsddb.hashopen(old_bdb_name, 'r')
    print "creating db file '%s'" % new_bdb_name
    new_db = bsddb.hashopen(new_bdb_name, 'n')
    global BAD
    BAD = string.atoi(old_db["IBAD"]) / 10.0
    BAD = 0.1 * int(BAD * 10.0)
    new_db['IBAD'] = old_db['IBAD']
    ids, rec_id_dict = get_ids(old_db)
    fill_new_db(ids, rec_id_dict, old_db, new_db)

if __name__ == '__main__':
    main()


=========================================================

The analysis of this program will have to wait a fair while as I’m still finishing another step. It looks like it’s a well done library of utilities so I’ll likely leave it to the end.

About E.M.Smith

A technical managerial sort interested in things from Stonehenge to computer science. My present "hot buttons' are the mythology of Climate Change and ancient metrology; but things change...
This entry was posted in GISStemp Technical and Source Code and tagged , , , , , . Bookmark the permalink.

4 Responses to GIStemp STEP1_comb_records

  1. Peter O'Neill says:

    This listing does not correspond to the version in the source archive I downloaded. In particular get_best(records) is called, but not defined in the file (but can be found in the file as I downloaded it), and combine(*args) is very different. Compare with the definition in the file as I downloaded it:

    def combine(*args):
    new_sums, new_wgts, new_data, begin, years, records = args
    record, rec_id, diff = apply(get_longest_overlap, args)
    if abs(diff – BAD) < 0.1:
    print “\tno other records okay”
    return 0
    del records[rec_id]
    add(new_sums, new_wgts, diff, begin, record)
    rec_begin = record[‘dict’][‘begin’]
    print “\t”, rec_id, rec_begin, record[‘years’] + rec_begin – 1, diff
    return 1

    The comb_records.py file I have is 8,266 bytes and dated May 20, 2008, and was unchanged in the source archives I downloaded last December, this January and March.

  2. Peter O'Neill says:

    Apologies for the loss of indentation in the Python source in the previous message. Refer to the archive for the correct indentation, which has been lost when pasted into the HTML reply.

  3. Peter O'Neill says:

    I’ve looked at the listing above more carefully, and noticed that what has happened is that lines have been lost, so that combine(*args) as shown above is in fact the first three and a half lines of the “real” combine(*args), followed by the last fourteen and a half lines from the “real” get_best(records), with the two half lines run into each other. This also explains the disappearance of get_best.

  4. E.M.Smith says:

    I think I figured out what happened and have fixed it.

    WordPress was, IMHO, looking for HTML tags (even though this text was bracketed with the “pre” pre-formatted tags) and when it found a matching lessthan greaterthan set (in its opinion) would try to evaluated that text as html and then just drop it if it was not HTML.

    I’ve put back in the missing text using the ampersand escaped .lt. and .gt. and we’ll see.

    Guess that now I get to go through all the code again looking for less than full text near missing less than signs…

Comments are closed.