-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatatables.py
364 lines (279 loc) · 9.96 KB
/
datatables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
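# -*- coding: utf-8 -*-
"""
Tuple-backed ``DataTable`` container with chainable query helpers
(``filter``, ``select``, ``distinct``, ``dup``, ``add_field``).
"""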
import string
import operator
import collections
from itertools import imap, ifilter
from functools import wraps
from _utils import SDict
from _signals import Signal
#from datarows import datarow_factory
class DataTableError(Exception):
    """Base class for the errors defined in this module."""


class DataTableCapacityError(DataTableError):
    """A row was added to a table that already holds ``capacity`` rows."""


class DataTableColumnError(DataTableError):
    """A column name is unknown or is not a string."""


class DataTableEventError(DataTableError):
    """Event-related failure (not raised anywhere in this file)."""


class DataTableTypeError(DataTableError):
    """Type-related failure (not raised anywhere in this file)."""
class Typed(object):
    """Data descriptor that type-checks values on assignment.

    Subclasses supply ``expected_type``. The value is stored in the owning
    instance's ``__dict__`` under ``name``, so reading an attribute that was
    never assigned raises ``KeyError``.
    """

    def __init__(self, name):
        self.name = name

    def __get__(self, instance, cls):
        # Access through the class hands back the descriptor object itself.
        if instance is not None:
            return instance.__dict__[self.name]
        return self

    def __set__(self, instance, value):
        if isinstance(value, self.expected_type):
            instance.__dict__[self.name] = value
            return
        raise TypeError("Expected %s (%s)"
                        % (self.expected_type, type(value)))
class ColnameDesc(Typed):
    """Typed descriptor that only accepts ``tuple`` values."""
    expected_type = tuple


class NameDesc(Typed):
    """Typed descriptor that only accepts ``str`` values."""
    expected_type = str


class CapacityDesc(Typed):
    """Typed descriptor that only accepts ``int`` values."""
    expected_type = int
def get_datatype_name(cls):
    """Return the bare name of a class, or ``'NoneType'`` for ``None``.

    The name is read from ``cls.__name__`` rather than parsed out of
    ``str(cls)``: the previous letter-only filter silently dropped digits
    and underscores (``Int32`` became ``Int``) and depended on the
    locale-sensitive, Python-2-only ``string.letters``.

    Raises AttributeError when ``cls`` has no ``__name__`` (e.g. an instance).
    """
    if cls is None:
        return 'NoneType'
    return cls.__name__
def is_attribute_access(obj, fields):
    """Tell whether every name in ``fields`` can be read as an attribute of ``obj``.

    Returns True when ``operator.attrgetter(*fields)`` succeeds on ``obj`` and
    False when it raises AttributeError.
    """
    probe = operator.attrgetter(*fields)
    try:
        probe(obj)
    except AttributeError:
        return False
    else:
        return True
def fields2index(fields, data):
    """Map each name in ``fields`` to its position in ``data``.

    Positions are returned in the order of ``fields``; names that are not
    present in ``data`` are skipped.
    """
    positions = []
    for field in fields:
        try:
            position = data.index(field)
        except ValueError:
            # Unknown field: left out of the result on purpose.
            continue
        positions.append(position)
    return positions
def tuple_insert(data, index, value):
    """Return a new tuple with ``value`` placed before position ``index``.

    ``index`` follows slice semantics, so a negative index counts from the
    end (``-1`` inserts before the last item, not after it).
    """
    head, tail = data[:index], data[index:]
    return head + (value,) + tail
###############################################################################
# Data functions
###############################################################################
def f_distinct(data):
    """Yield each distinct row once, in first-seen order.

    Every row is converted to a tuple so it can be hashed; rows that contain
    unhashable values raise TypeError.
    """
    seen = set()
    for item in data:
        row = tuple(item)
        if row not in seen:
            seen.add(row)
            yield row
def f_dup(data):
    """Yield every row that repeats an earlier row.

    Every row is converted to a tuple so it can be hashed. A row is yielded
    once per repetition, so a row that occurs three times is yielded twice.
    """
    seen = set()
    for item in data:
        row = tuple(item)
        if row in seen:
            yield row
        else:
            seen.add(row)
def _look(data):
for row in data:
print "- ilook -> {}".format(row)
yield row
###############################################################################
# Decorators
###############################################################################
def expr_decorator(method, colnames):
    """Adapt ``method`` so it can be called with a plain row.

    The returned callable pairs ``colnames`` with the row's values, wraps the
    pairs in an ``SDict`` and passes that object to ``method``.
    """
    def inner(row):
        return method(SDict(zip(colnames, row)))
    return inner
def fluent(method):
    """Turn a row-producing method into one that returns a new table.

    The decorated method must return an iterable of rows. The wrapper returns
    ``self`` unchanged when the table is empty, or when a method whose name
    ends in ``select`` is called without positional arguments. Otherwise it
    builds a new instance of the same class, copies the attributes of
    ``self`` into it and fills it with the produced rows.

    NOTE: the attribute copy is shallow, so mutable attributes other than
    ``list`` are shared between the source table and the result.
    """
    @wraps(method)
    def inner(self, *args, **kwargs):
        # Nothing to work on: hand the empty table back.
        if not self.is_initialized:
            return self
        method_name = method.__name__
        # A select-style call without any field is a no-op.
        if method_name.endswith('select') and not args:
            return self

        # Bare instance: __init__ is deliberately bypassed.
        clone = self.__class__.__new__(self.__class__)
        # Run the method before copying the attributes, so that any change
        # it makes on ``self`` is carried over to the new instance.
        rows = method(self, *args, **kwargs)
        clone.__dict__ = self.__dict__.copy()
        if method_name == 'select':
            clone.colnames = args
        # Fresh row storage, then consume the (possibly lazy) rows.
        clone.list = []
        clone.extend(rows)
        return clone
    return inner
class DataTable(collections.MutableSequence):
    """Mutable sequence of rows with named columns.

    Every row is stored as a tuple. Positional constructor arguments are the
    initial rows; the accepted keyword arguments are:

    colnames -- tuple of column names. When omitted and rows are given, the
        names come from the first row (``firstrow_header``) or are generated
        as ``C0``, ``C1``, ... from the width of the first row.
    name -- table name (default ``'data_table'``).
    capacity -- maximum number of rows; ``0`` (default) disables the check.
    firstrow_header -- when true and no ``colnames`` are given, the first
        row is removed from the data and used as the column names.

    Any other keyword argument raises DataTableError.
    """

    colnames = ColnameDesc('colnames')
    name = NameDesc('name')
    capacity = CapacityDesc('capacity')

    def __init__(self, *args, **kwargs):
        # Column names.
        self.colnames = kwargs.pop('colnames', ())
        # Name of the 'DataTable' object.
        self.name = kwargs.pop('name', 'data_table')
        # Maximum number of rows in the container (0: unlimited).
        self.capacity = kwargs.pop('capacity', 0)
        # Whether the first row is a header line.
        self.firstrow_header = kwargs.pop('firstrow_header', False)
        # input_converter = kwargs.pop('input_converter', True)
        # """Disable input conversion to 'tuple' object (more speed)."""
        if kwargs:
            raise DataTableError("Unexpected keyword arguments (%r)" % kwargs)
        self.events = SDict(onAppend=Signal(), onInsert=Signal())
        if self.capacity:
            # Connect events to handlers.
            self.events.onAppend.append(self._capacity_checker)
            self.events.onInsert.append(self._capacity_checker)
        self.list = list()
        self.extend(args)
        if args and not self.colnames:
            if self.firstrow_header:
                self.colnames = tuple(self.list.pop(0))
            else:
                self.colnames = tuple("C%i" % x
                                      for x in range(len(self.list[0])))

    def __iter__(self):
        return iter(self.list)

    def __len__(self):
        return len(self.list)

    def __add__(self, value):
        """Append the rows of ``value`` to this table and return ``self``.

        NOTE(review): ``+`` behaves exactly like ``+=`` and mutates the left
        operand; kept as is because callers may rely on it.
        """
        if value is not self:
            self.extend(value)
        else:
            # Iterate over a snapshot: the table grows while it is doubled.
            for row in self.list[:]:
                self.append(row)
        return self

    __iadd__ = __add__

    def __getitem__(self, item):
        """Return a row, a list of rows, or a whole column.

        A string selects a column and returns a tuple whose first element is
        the column name followed by that column's value in every row. Any
        other key (integer or slice) is applied to the underlying row list.

        Raises DataTableColumnError for an unknown column name.
        """
        if isinstance(item, basestring):
            if item not in self.colnames:
                raise DataTableColumnError("Column '%s' not found" % item)
            idx = self.colnames.index(item)
            # Add header name in the first position
            return (item,) + tuple(row[idx] for row in self.list)
        # Integers and slices go straight to the list; anything else makes
        # the list raise TypeError instead of silently returning None.
        return self.list[item]

    def __delitem__(self, index):
        # The list handles integers and slices natively. Deleting a slice
        # one index at a time would shift the remaining rows under the loop
        # and remove the wrong ones.
        del self.list[index]

    def __setitem__(self, index, row):
        """Replace one row, or a slice of rows, converting rows to tuples.

        Raises DataTableError when an integer ``index`` does not exist.
        NOTE: slice assignment does not go through the capacity check.
        """
        if isinstance(index, slice):
            # ``row`` is an iterable of rows in this case.
            self.list[index] = [tuple(item) for item in row]
            return
        try:
            self.list[index] = tuple(row)
        except IndexError:
            raise DataTableError("Index '%i' not created" % index)

    def __repr__(self):
        return "DataTable(%s)" % self.list

    def __str__(self):
        return str(self.list)

    #
    # built-in event handlers.
    #
    def _capacity_checker(self):
        """Raise DataTableCapacityError when the table is already full."""
        if self.count >= self.capacity:
            raise DataTableCapacityError(
                "Maximun capacity reached, stop ('%i')" % self.capacity)

    #
    # Special methods
    #
    def append(self, row):
        """Fire ``onAppend``, then add ``row`` (as a tuple) at the end."""
        self.events.onAppend()
        self.list.append(tuple(row))

    def insert(self, index, row):
        """Fire ``onInsert``, then insert ``row`` (as a tuple) at ``index``."""
        self.events.onInsert()
        self.list.insert(index, tuple(row))

    @fluent
    def _with_rows(self, rows):
        """Return a copy of this table whose rows are replaced by ``rows``."""
        return rows

    @fluent
    def _distinct_rows(self):
        """Return a copy of this table without repeated rows."""
        return f_distinct(self)

    @fluent
    def _dup_rows(self):
        """Return a copy of this table holding only the repeated rows."""
        return f_dup(self)

    @fluent
    def filter(self, expr):
        """Return a new table with the rows for which ``expr`` is true.

        When the rows do not expose the column names as attributes, ``expr``
        receives an ``SDict`` built from (column name, value) pairs instead
        of the raw row.
        """
        if not is_attribute_access(self[0], self.colnames):
            # Necessary for attribute access
            expr = expr_decorator(expr, self.colnames)
        return ifilter(expr, self)

    @fluent
    def select(self, *fields, **kwargs):
        """Return a new table restricted to the columns named in ``fields``.

        The optional ``where`` keyword is a row predicate applied before the
        projection; it is adapted the same way as in ``filter``.

        Raises DataTableColumnError when a field is not a string or is not a
        column of this table.
        """
        if not all(isinstance(x, basestring) for x in fields):
            raise DataTableColumnError("Use only string types for parameter "
                                       "'fields'.")
        invalid_colnames = set(fields) - set(self.colnames)
        if invalid_colnames:
            raise DataTableColumnError("Column '%s' not found"
                                       % ', '.join(sorted(invalid_colnames)))
        # Bind the predicate unconditionally: it used to be left undefined
        # when the rows already offered attribute access.
        expr = kwargs.pop('where', lambda x: x)
        if not is_attribute_access(self[0], self.colnames):
            # Necessary for attribute access
            expr = expr_decorator(expr, self.colnames)
        if len(fields) == 1:
            # itemgetter with one index returns a bare value, not a tuple.
            field_index = self.colnames.index(fields[0])
            getter = lambda row: (row[field_index],)
        else:
            getter = operator.itemgetter(*fields2index(fields, self.colnames))
        return imap(getter, ifilter(expr, self))

    def distinct(self, *fields):
        """Return a new table with distinct rows.

        With ``fields``, the rows are first narrowed to those columns, and
        the result carries exactly those column names.
        """
        source = self.select(*fields) if fields else self
        return source._distinct_rows()

    def dup(self, *fields):
        """Return a new table with the duplicated rows.

        With ``fields``, the rows are first narrowed to those columns, and
        the result carries exactly those column names.
        """
        source = self.select(*fields) if fields else self
        return source._dup_rows()

    def add_field(self, name, value='', index=-1):
        """Return a new table with one more column; this table is untouched.

        name -- name of the new column.
        value -- constant stored in every row, or a callable that receives
            the row as an ``SDict`` and returns the cell value.
        index -- position of the new column; ``-1`` (default) appends it.

        An empty table is returned as is.
        """
        if not self.is_initialized:
            return self
        # Resolve "append" once, so the rows and the column names agree on
        # where the new cell goes.
        if index == -1:
            index = len(self.colnames)
        if callable(value):
            compute = expr_decorator(value, self.colnames)
            rows = (tuple_insert(row, index, compute(row)) for row in self)
        else:
            rows = (tuple_insert(row, index, value) for row in self)
        table = self._with_rows(rows)
        table.colnames = tuple_insert(self.colnames, index, name)
        return table

    def clear(self, init=0, offset=0):
        """Remove ``offset`` rows starting at position ``init``.

        With ``offset`` equal to 0 (default), every row from ``init`` to the
        end is removed, so ``clear()`` empties the table.
        """
        if offset:
            del self.list[init:init + offset]
        else:
            del self.list[init:]

    @property
    def is_initialized(self):
        """True when the table holds at least one row."""
        return bool(self.count)

    @property
    def count(self):
        """Number of rows."""
        return len(self)

    @property
    def shape(self):
        """``(columns, rows)``; columns is the width of the widest row.

        An empty table reports ``(0, 0)``.
        """
        rows = self.count
        cols = max(len(x) for x in self.list) if rows else 0
        return (cols, rows)