## GIStemp STEP1_comb_records

### The Python program comb_records.py

Here is the listing. The explanation will follow some months or weeks from now below, after the === bar.

```

#! /usr/bin/python

import bsddb, stationstring, monthlydata, string, sys

MIN_OVERLAP = 4

def average(new_sums, new_wgts, new_data, years):
for m in range(12):
sums_row = new_sums[m]
wgts_row = new_wgts[m]
data_row = new_data[m]
for n in range(years):
wgt = wgts_row[n]
if wgt == 0:
continue
data_row[n] = sums_row[n] / wgt

def final_average(new_sums, new_wgts, new_data, years, begin):
min, max = 9999, -9999
for m in range(12):
mon_min, mon_max = 9999, -9999
wgts_row = new_wgts[m]
for n in range(years):
wgt = wgts_row[n]
if wgt == 0:
continue
if mon_min > n:
mon_min = n
if mon_max < n:
mon_max = n
if min >  mon_min:
min = mon_min
if max < mon_max:
max = mon_max
if min == 0 and max == years - 1:
average(new_sums, new_wgts, new_data, years)
return begin
years = max - min + 1
begin = begin + min
end = begin + years - 1
for m in range(12):
new_sums[m] = new_sums[m][min: max + 1]
new_wgts[m] = new_wgts[m][min: max + 1]
average(new_sums, new_wgts, new_data, years)
return begin

def add(new_sums, new_wgts, diff, begin, record):
rec_begin = record['dict']['begin']
rec_years = record['years']
rec_data = record['data']
for m in range(12):
sums_row = new_sums[m]
wgts_row = new_wgts[m]
data_row = rec_data[m]
for n in range(rec_years):
datum = data_row[n]
if abs(datum - BAD) < 0.1:
continue
year = n + rec_begin
index = year - begin
sums_row[index] = sums_row[index] + datum - diff
wgts_row[index] = wgts_row[index] + 1

def get_longest_overlap(new_sums, new_wgts, new_data, begin, years, records):
end = begin + years - 1
average(new_sums, new_wgts, new_data, years)
ann_mean, ann_anoms = mon.annual()
overlap = 0
length = 0
for rec_id, record in records.items():
rec_ann_anoms = record['ann_anoms']
rec_ann_mean = record['ann_mean']
rec_years = record['years']
rec_begin = record['dict']['begin']
sum = wgt = 0
for n in range(rec_years):
rec_anom = rec_ann_anoms[n]
if abs(rec_anom - BAD) < 0.1:
continue
year = n + rec_begin
anom = ann_anoms[year - begin]
if abs(anom - BAD) < 0.1:
continue
wgt = wgt + 1
sum = sum + (rec_ann_mean + rec_anom) - (ann_mean + anom)
if wgt < MIN_OVERLAP:
continue
if wgt < overlap:
continue
overlap = wgt
diff = sum / wgt
best_id = rec_id
best_record = record
if overlap < MIN_OVERLAP:
return best_record, best_id, diff

def combine(*args):
new_sums, new_wgts, new_data, begin, years, records = args
record, rec_id, diff = apply(get_longest_overlap, args)
if abs(diff - BAD) < 0.1:
print "\tno other records okay"
return 0
del records[rec_id]
rec_begin = record['dict']['begin']
print "\t", rec_id, rec_begin, record['years'] + rec_begin - 1, diff
return 1

def get_best(records):
ranks = {'MCDW': 4, 'USHCN': 3, 'SUMOFDAY': 2, 'UNKNOWN': 1}
best = 1
rids = records.keys()
rids.sort()
for rec_id in rids:
record = records[rec_id]
source = record['dict']['source']
rank = ranks
if rank >  best:
best = rank
best_rec = record
best_id = rec_id
if best > 1:
return best_rec, best_id
longest = 0
for rec_id in rids:
record = records[rec_id]
length = record['length']
if length > longest:
longest = length
longest_rec = record
longest_id = rec_id
return longest_rec, longest_id

def get_records(old_db, rec_ids):
res = {}
min, max = 9999, -9999
for rec_id in rec_ids:
s = old_db[rec_id]
st = stationstring.new(s)
dict = st.dict()
data, years = st.data_years()
assert years == len(data)
begin = dict['begin']
end = begin + years - 1
if min > begin:
min = begin
if max < end:
max = end
ann_mean, ann_anoms = mon.annual()
length = 0
for anom in ann_anoms:
length = length + 1
res[rec_id] = {'dict': dict, 'data': data,
'string': s, 'years': years,
'length': length, 'ann_anoms': ann_anoms,
'ann_mean': ann_mean}
return res, min, max

def get_new_data(record, begin, years):
sums, wgts, data = [None] * 12, [None] * 12, [None] *12
rec_data = record['data']
rec_begin, rec_years = record['dict']['begin'], record['years']
rec_end = rec_begin + rec_years - 1
for m in range(12):
sums_row, wgts_row, data[m] = [0.0] * years,  * years, [BAD] * years
rec_row = rec_data[m]
for n in range(rec_years):
datum = rec_row[n]
if abs(datum - BAD)  < 0.1:
continue
# assert abs(datum - BAD) > 0.1, "%.20f %.20f" % (datum, BAD)
year = n + rec_begin
index = year - begin
sums_row[index] = datum
wgts_row[index] = 1
sums[m] = sums_row
wgts[m] = wgts_row
return sums, wgts, data

def fill_new_db(ids, rec_id_dict, old_db, new_db):
new_ids = []
for id in ids:
rec_ids = rec_id_dict[id]
print id
while 1:
records, begin, end = get_records(old_db, rec_ids)
if len(records) == 1:
rec_id, record = records.items()
new_db[rec_id] = record['string']
new_ids.append(rec_id)
break
years = end - begin + 1
record, rec_id = get_best(records)
rec_dict = record['dict']
source = rec_dict['source']
del records[rec_id]
new_sums, new_wgts, new_data = get_new_data(record, begin, years)
new_dict = rec_dict.copy()
new_dict['begin'] = begin
print "\t%s %s %s -- %s" % (rec_id, begin, begin + years -1,source)
ok_flag = 1
while ok_flag and records.keys():
ok_flag = combine(new_sums, new_wgts, new_data,
begin, years, records)
begin = final_average(new_sums, new_wgts, new_data, years, begin)
new_dict['begin'] = begin
s = stationstring.serialize(new_dict, new_data)
new_db[rec_id] = s
new_ids.append(rec_id)
rec_ids = records.keys()
if not rec_ids:
break
new_db['IDS'] = string.join(new_ids, ' ')

def get_ids(old_db):
rec_ids = string.split(old_db['IDS'])
rec_ids.sort()
rec_id_dict = {}
for rec_id in rec_ids:
id = rec_id[:-1]
try:
rec_id_dict[id].append(rec_id)
except KeyError:
rec_id_dict[id] = [rec_id]
ids = rec_id_dict.keys()
ids.sort()
return ids, rec_id_dict

def main():
stemname = sys.argv
old_bdb_name = stemname + '.bdb'
new_bdb_name = stemname + '.combined.bdb'
print "opening db file '%s'" % old_bdb_name
old_db = bsddb.hashopen(old_bdb_name, 'r')
print "creating db file '%s'" % new_bdb_name
new_db = bsddb.hashopen(new_bdb_name, 'n')
ids, rec_id_dict = get_ids(old_db)
fill_new_db(ids, rec_id_dict, old_db, new_db)

if __name__ == '__main__':
main()

```

=========================================================

The analysis of this program will have to wait a fair while as I’m still finishing another step. It looks like it’s a well done library of utilities so I’ll likely leave it to the end. A technical managerial sort interested in things from Stonehenge to computer science. My present "hot buttons' are the mythology of Climate Change and ancient metrology; but things change...
This entry was posted in GISStemp Technical and Source Code and tagged , , , , , . Bookmark the permalink.

### 4 Responses to GIStemp STEP1_comb_records

1. Peter O'Neill says:

This listing does not correspond to the version in the source archive I downloaded. In particular get_best(records) is called, but not defined in the file (but can be found in the file as I downloaded it), and combine(*args) is very different. Compare with the definition in the file as I downloaded it:

def combine(*args):
new_sums, new_wgts, new_data, begin, years, records = args
record, rec_id, diff = apply(get_longest_overlap, args)
if abs(diff – BAD) < 0.1:
print “\tno other records okay”
return 0
del records[rec_id]
rec_begin = record[‘dict’][‘begin’]
print “\t”, rec_id, rec_begin, record[‘years’] + rec_begin – 1, diff
return 1

The comb_records.py file I have is 8,266 bytes and dated May 20, 2008, and was unchanged in the source archives I downloaded last December, this January and March.

2. Peter O'Neill says:

Apologies for the loss of indentation in the Python source in the previous message. Refer to the archive for the correct indentation, which has been lost when pasted into the HTML reply.

3. Peter O'Neill says:

I’ve looked at the listing above more carefully, and noticed that what has happened is that lines have been lost, so that combine(*args) as shown above is in fact the first three and a half lines of the “real” combine(*args), followed by the last fourteen and a half lines from the “real” get_best(records), with the two half lines run into each other. This also explains the disappearance of get_best.

4. E.M.Smith says:

I think I figured out what happened and have fixed it.

WordPress was, IMHO, looking for HTML tags (even though this text was bracketed with the “pre” pre-formatted tags) and when it found a matching lessthan greaterthan set (in its opinion) would try to evaluated that text as html and then just drop it if it was not HTML.

I’ve put back in the missing text using the ampersand escaped .lt. and .gt. and we’ll see.

Guess that now I get to go through all the code again looking for less than full text near missing less than signs…