"""Convert a small embedded floor-plan table into a nanocube-ready .dmp file.

The whitespace-separated table in ``data`` is parsed into ``records`` (one
``Record`` per row), and a text header describing every column is written to
``floorplan-nanocube-ready.dmp``.

Written so that it runs unchanged on Python 2.7 and Python 3.
"""
import sys
import struct

# First non-empty line is the header; every following line is one record.
data = '''
location job time count
2,3 designer 1 3
1,8 manager 1 3
3,11 programmer 1 6
5,6 manager 1 8
11,10 designer 1 5
12,4 programmer 1 5
13,4 designer 1 6
'''

# Coding of the categorical column "job". Every categorical column <name>
# needs a module-level dict called <name>_code mapping label -> integer code.
job_code = {
    'designer': 0,
    'programmer': 1,
    'manager': 2,
}

# List of (column_name, column_type) pairs; filled in from the header row.
columns = None


class Record(object):
    """Attribute bag for one parsed row; attributes are named after columns."""

    def __init__(self):
        pass


records = []

# Columns called location, time and count have a fixed meaning. Any other
# column is considered a categorical dimension of up to 255 different values.
LOCATION, TIME, COUNT, CATEGORICAL = range(4)


def type_from_name(c):
    """Return the column-type constant for the column named *c*.

    'location', 'time' and 'count' map to their dedicated constants; every
    other name is treated as CATEGORICAL.
    """
    known = {'location': LOCATION, 'time': TIME, 'count': COUNT}
    return known.get(c, CATEGORICAL)


def _codes_for(column_name):
    """Return the label -> code dict of the categorical column *column_name*.

    The dict is looked up by name (``<column_name>_code``) in the module
    globals rather than by eval()-ing text that comes from the header row.

    Raises:
        NameError: if no such module-level dict exists.
    """
    map_name = "%s_code" % column_name
    try:
        return globals()[map_name]
    except KeyError:
        raise NameError(
            "categorical column %r needs a module-level dict named %r"
            % (column_name, map_name))


def field_type_for(column_type, grid_levels=4):
    """Return the nanocube field-type string for *column_type*.

    Args:
        column_type: one of LOCATION, TIME, COUNT, CATEGORICAL.
        grid_levels: quadtree depth used for LOCATION columns; the default
            of 4 means a grid of 2^4 by 2^4.

    Raises:
        ValueError: if *column_type* is not one of the four known constants.
    """
    if column_type == LOCATION:
        return "nc_dim_quadtree_" + str(grid_levels)
    if column_type == TIME:
        # 2 bytes: time bins can go from 0 to 2^16-1
        return "nc_dim_time_2"
    if column_type == COUNT:
        # 4 bytes: counts can go from 0 to 2^32-1
        return "nc_var_uint_4"
    if column_type == CATEGORICAL:
        return "nc_dim_cat_1"
    raise ValueError("unknown column type: %r" % (column_type,))


# collect records
for line_no, line in enumerate(data.split('\n'), 1):
    line = line.strip()
    if not line:
        continue
    print(line)
    tokens = line.split()
    print(tokens)

    if columns is None:
        # read header
        columns = [(name, type_from_name(name)) for name in tokens]
        continue

    if len(tokens) != len(columns):
        raise Exception(
            "Problem on line %d: tokens and columns are different" % line_no)

    record = Record()
    for (column_name, column_type), value in zip(columns, tokens):
        if column_type == LOCATION:
            # "x,y" -> [x, y]
            value = [int(x) for x in value.split(',')]
        elif column_type == TIME or column_type == COUNT:
            value = int(value)
        elif column_type == CATEGORICAL:
            # replace the label by its integer code
            value = _codes_for(column_name)[value]
        setattr(record, column_name, value)
    records.append(record)
    print(record.__dict__)

# build the text header of the nanocube-ready .dmp file
header_lines = ["name: floorplan\n"]
for column_name, column_type in columns:
    header_lines.append(
        "field: %s %s\n" % (column_name, field_type_for(column_type)))
for column_name, column_type in columns:
    if column_type == CATEGORICAL:
        # code -> label, so that the value names are listed in code order
        inverted_codes = {
            code: label for label, code in _codes_for(column_name).items()}
        for code in sorted(inverted_codes):
            header_lines.append(
                "valname: %s %d %s\n"
                % (column_name, code, inverted_codes[code]))
header_lines.append("\n")  # end of header indication

# Binary mode: the records that follow the header are struct-packed bytes, so
# the header text is encoded and written through the same byte stream.
with open('floorplan-nanocube-ready.dmp', 'wb') as ostream:
    ostream.write("".join(header_lines).encode('utf-8'))

    # NOTE(review): the binary record writer is missing. The source this
    # script was recovered from breaks off right after the header, in the
    # middle of a struct.pack() call. The surviving text is kept verbatim
    # below; restore the loop here, inside the ``with`` block:
    #
    #   for r in records: # write binary records
    #       for column_name,column_type in columns:
    #           if column_type == LOCATION:
    #               ostream.write(struct.pack("