Initial preprocessing

df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[ns]'))

df = pd.DataFrame({'date': ['2019-12-04', None, '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])
test_eq(df[df.Elapsed.isna()].shape,(1, 13))
df.head()

<ipython-input-7-02bfdad141e1>:10: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.
  for n in attr: df[prefix + n] = getattr(field.dt, n.lower())

df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
                   'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df

# `df_shrink_dtypes` only reports the proposed dtype per column; `df` keeps its original dtypes
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
                   'date':['2019-12-04','2019-11-29','2019-11-15',]})
dt = df_shrink_dtypes(df)
test_eq(df['i'].dtype, 'int64')
test_eq(dt['i'], 'int8')

test_eq(df['f'].dtype, 'float64')
test_eq(dt['f'], 'float32')

# By default 'object' columns are proposed as 'category'
test_eq(df['date'].dtype, 'object')
test_eq(dt['date'], 'category')

# With obj2cat=False, 'object' columns are left out of the result
dt2 = df_shrink_dtypes(df, obj2cat=False)
test_eq('date' not in dt2, True)

df_shrink(df) attempts to make a DataFrame use less memory by fitting numeric columns into the smallest possible datatypes. In addition:

boolean, category, datetime64[ns] dtype columns are ignored.
'object' type columns are categorified, which can save a lot of memory in large dataset. It can be turned off by obj2cat=False.
int2uint=True, to fit int types to uint types, if all data in the column is >= 0.
columns can be excluded by name using skip=['col1','col2'].

To get only the new column data types without actually casting the DataFrame, use df_shrink_dtypes() with the same parameters as df_shrink().

df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10,254],
                  'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])

test_eq(df['i'].dtype=='int64' and df2['i'].dtype=='int8', True)
test_eq(df['f'].dtype=='float64' and df2['f'].dtype=='float32', True)
test_eq(df['u'].dtype=='int64' and df2['u'].dtype=='int16', True)
test_eq(df2['date'].dtype, 'object')

test_eq(df2.memory_usage().sum() < df.memory_usage().sum(), True)

# Test int => uint (when col.min() >= 0)
df3 = df_shrink(df, int2uint=True)
test_eq(df3['u'].dtype, 'uint8')  # int64 -> uint8 instead of int16

# Test excluding columns
df4 = df_shrink(df, skip=['i','u'])
test_eq(df['i'].dtype, df4['i'].dtype)
test_eq(df4['u'].dtype, 'int64')

Here's an example using the ADULT_SAMPLE dataset:

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)
print(f"Memory usage: {df.memory_usage().sum()} --> {new_df.memory_usage().sum()}")

Memory usage: 3907448 --> 818665

df: A DataFrame of your data
cat_names: Your categorical x variables
cont_names: Your continuous x variables
y_names: Your dependent y variables
- Note: Mixed y's, such as regression and classification together, are not currently supported; however, multiple regression or multiple classification outputs are.
y_block: How to sub-categorize the type of y_names (CategoryBlock or RegressionBlock)
splits: How to split your data
do_setup: Whether Tabular will run the data through the procs upon initialization
device: cuda or cpu
inplace: If True, Tabular will not keep a separate copy of your original DataFrame in memory. You should ensure pd.options.mode.chained_assignment is None before setting this
reduce_memory: fastai will attempt to reduce the overall memory usage of the input DataFrame with df_shrink

df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
to = TabularPandas(df, cat_names='a')
t = pickle.loads(pickle.dumps(to))
test_eq(t.items,to.items)
test_eq(to.all_cols,to[['a']])

df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_eq(cat['a'], ['#na#',0,1,2])
test_eq(to['a'], [1,2,3,1,3])
to.show()

df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = to.new(df1)
to1.process()
#Values that weren't in the training df are sent to 0 (na)
test_eq(to1['a'], [2,1,0,0,3])
to2 = cat.decode(to1)
test_eq(to2['a'], [1,0,'#na#','#na#',2])

# With `splits`, the vocab is expected to hold only values found in the first split (rows 0-2)
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
test_eq(cat['a'], ['#na#',0,1,2])
# Row 3 holds the value 3, which is not in that vocab, so it is encoded as 0 ('#na#')
test_eq(to['a'], [1,2,3,0,3])

df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_eq(cat['a'], ['#na#','H','M','L'])
test_eq(to.items.a, [2,1,3,2])
to2 = cat.decode(to)
test_eq(to2['a'], ['M','H','L','M'])

cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])
test_eq(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_eq(to2['b'], ['a', 'b', 'a', 'b', 'b'])

cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])
test_eq(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_eq(to2['b'], ['a', 'b', 'a', 'b', 'b'])

# The target label 'c' occurs only at row 3, outside the first split (rows 0-2),
# so it is expected to be absent from the target vocab
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'c', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])

norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (x-m)/s)

df1 = pd.DataFrame({'a':[5,6,7]})
to1 = to.new(df1)
to1.process()
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
to2 = norm.decode(to1)
test_close(to2['a'].values, [5,6,7])

# With `splits`, the mean/std are expected to come from the first split only (rows 0-2)
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
# ...while every row, including rows 3-4, is normalized with those statistics
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)

Currently, filling with the median, a constant, and the mode are supported.

fill1,fill2,fill3 = (FillMissing(fill_strategy=s) 
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
tos = (TabularPandas(df, fill1, cont_names='a'),
       TabularPandas(df1, fill2, cont_names='a'),
       TabularPandas(df2, fill3, cont_names='a'))
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

for t in tos: test_eq(t.cat_names, ['a_na'])

for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_['a'].values, np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(to_['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))

fill = FillMissing() 
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(fill.na_dict, {'a': 1.5})
test_eq(to.cat_names, ['a_na'])
test_eq(to['a'].values, np.array([0, 1, 1.5, 1, 2, 3, 4]))
test_eq(to['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
test_eq(to['b'].values, np.array([0,1,2,3,4,5,6]))

procs = [Normalize, Categorify, FillMissing, noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')

#Test setup and apply on df_main
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})

df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
test_eq(to.vocab, ['a','b'])

df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [1,0,1,0,0,1,0])

df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,2,1,0,2,0])
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,2,1,1,1,1,1])
test_eq(to['c'], [1,0,0,0,1,0,1])

from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter,_DatasetKind

_loaders = (_MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter)

Integration example

For a more in-depth explanation, see the tabular tutorial

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_test.drop('salary', axis=1, inplace=True)
df_main.head()

cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)

dls = to.dataloaders()
dls.valid.show_batch()

to.show()

We can decode any set of transformed data by calling to.decode_row with our raw data:

row = to.items.iloc[0]
to.decode_row(row)

age                                  33
workclass                       Private
fnlwgt                           248584
education                  Some-college
education-num                        10
marital-status       Married-civ-spouse
occupation                Other-service
relationship                    Husband
race                              White
sex                                Male
capital-gain                          0
capital-loss                          0
hours-per-week                       50
native-country            United-States
salary                             <50k
education-num_na                  False
Name: 3380, dtype: object

to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()

tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()

Other target types

Multi-label categories

one-hot encoded label

def _mock_multi_label(df):
    "Replace `salary` with a boolean flag and add boolean `male`/`white` columns (mock one-hot targets); modifies and returns `df`"
    # Vectorized comparisons replace the per-row Python loop. The result is boolean even for an
    # empty frame, where building the column from an empty list would silently produce float64.
    # `.values` keeps the assignment positional, exactly like assigning a plain array.
    # The compared literals (including the leading spaces) are kept exactly as they were.
    df['salary'] = (df['salary'] == '>=50k').values
    df['male']   = (df['sex'] == ' Male').values
    df['white']  = (df['race'] == ' White').values
    return df

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)

df_main.head()

cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]

%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, y_block=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)

CPU times: user 77.2 ms, sys: 238 µs, total: 77.4 ms
Wall time: 76.7 ms

dls = to.dataloaders()
dls.valid.show_batch()

Not one-hot encoded

def _mock_multi_label(df):
    "Add a `target` column of space-separated labels derived from `salary`, `sex` and `race`; modifies and returns `df`"
    def _labels_for(rec):
        # Label order is fixed: salary first, then sex, then race
        flags = (('>50k', rec.salary == '>=50k'),
                 ('male', rec.sex == ' Male'),
                 ('white', rec.race == ' White'))
        return ' '.join(tag for tag,hit in flags if hit)

    df['target'] = np.array([_labels_for(rec) for rec in df.itertuples()])
    return df

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)

df_main.head()

# Attaches a `Tabular`-typed `encodes` to `MultiCategorize`
# (presumably through fastcore's decorator-style method registration — confirm).
@MultiCategorize
def encodes(self, to:Tabular): 
    # Currently a pass-through: the vocab-mapping call below is disabled, so `to` is returned untouched
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to
  
# Attaches a `Tabular`-typed `decodes` to `MultiCategorize`
# (presumably through fastcore's decorator-style method registration — confirm).
@MultiCategorize
def decodes(self, to:Tabular): 
    # Currently a pass-through: the vocab-decoding call below is disabled, so `to` is returned untouched
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to

cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", y_block=MultiCategoryBlock(), splits=splits)

CPU times: user 81 ms, sys: 178 µs, total: 81.2 ms
Wall time: 80.1 ms

to.procs[2].vocab

(#24) ['-','_','a','c','d','e','f','g','h','i'...]

Regression

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)

cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', splits=splits)

CPU times: user 82.2 ms, sys: 508 µs, total: 82.7 ms
Wall time: 81.8 ms

to.procs[-1].means

{'fnlwgt': 193046.84475, 'education-num': 10.08025}

dls = to.dataloaders()
dls.valid.show_batch()

Not being used now - for multi-modal

class TensorTabular(fastuple):
    "A tuple of tensors that can create empty row contexts and display them as a DataFrame"
    def get_ctxs(self, max_n=10, **kwargs):
        "Return one empty row per sample, capped at `max_n`; the sample count is the first dim of `self[0]`"
        count = min(self[0].shape[0], max_n)
        frame = pd.DataFrame(index=range(count))
        return [frame.iloc[pos] for pos in range(count)]

    def display(self, ctxs):
        "Assemble `ctxs` into a DataFrame and hand it to `display_df`"
        display_df(pd.DataFrame(ctxs))

class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        "Return `self` when there is no `ctx`; otherwise return `ctx` extended with this line"
        if ctx is None: return self
        # `Series.append` was removed in pandas 2.0; `pd.concat` produces the same result
        # on old and new pandas alike.
        if isinstance(ctx, pd.Series): return pd.concat([ctx, self])
        # Any other kind of context keeps the previous behaviour of delegating to its own `append`
        return ctx.append(self)

class ReadTabLine(ItemTransform):
    "Turn one row into a `TensorTabular` of (categorical, continuous) tensors, using the column names held by `proc`"
    def __init__(self, proc): self.proc = proc

    def encodes(self, row):
        # Look up every categorical / continuous column name in `row`
        # (`.map` suggests the name collections are fastcore `L`-like — TODO confirm)
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        # Categorical values become a long tensor, continuous values a float tensor
        return TensorTabular(tensor(cats).long(),tensor(conts).float())

    def decodes(self, o):
        # NOTE(review): other calls in this file use TabularPandas(df, procs, cat_names, cont_names, ...);
        # here `cat_names` lands in the `procs` position — confirm the intended argument order.
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        # Pair the decoded values with their column names, categorical names first.
        # NOTE(review): `to.items[0]+to.items[1]` assumes `items` is indexable into two addable parts — confirm.
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))

class ReadTabTarget(ItemTransform):
    "Read the target named by `proc.y_names` from a row, and decode it back to a `Category`"
    def __init__(self, proc):
        self.proc = proc

    def encodes(self, row):
        "Select `proc.y_names` from `row` and cast the result to int64"
        target = row[self.proc.y_names]
        return target.astype(np.int64)

    def decodes(self, o):
        "Look `o` up in the classes stored under `proc.y_names` and wrap it in a `Category`"
        classes = self.proc.classes
        vocab = classes[self.proc.y_names]
        return Category(vocab[o])

# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)

# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')

# test_stdout(lambda: print(show_at(tds, 1)), """a               1
# b_na        False
# b               1
# category        a
# dtype: object""")

	workclass	education	marital-status	occupation	relationship	race	education-num_na	fnlwgt	education-num	age
0	State-gov	Masters	Never-married	#na#	Not-in-family	White	False	47569.994748	14.0	36.0
1	Federal-gov	11th	Never-married	Sales	Not-in-family	Black	False	166418.999287	7.0	50.0
2	Private	9th	Divorced	Farming-fishing	Not-in-family	Black	False	225603.000537	5.0	58.0
3	Local-gov	12th	Widowed	Adm-clerical	Not-in-family	White	False	48055.004282	8.0	55.0
4	Federal-gov	Prof-school	Divorced	Prof-specialty	Not-in-family	White	False	66504.003988	15.0	57.0
5	Private	Some-college	Never-married	Adm-clerical	Unmarried	Asian-Pac-Islander	False	91274.998927	10.0	36.0
6	State-gov	Bachelors	Married-civ-spouse	Exec-managerial	Husband	White	False	391584.996528	13.0	49.0
7	Self-emp-not-inc	1st-4th	Divorced	Craft-repair	Not-in-family	White	False	130435.999390	2.0	71.0
8	Private	Bachelors	Never-married	Prof-specialty	Own-child	White	False	62507.003940	13.0	22.0
9	Private	HS-grad	Married-civ-spouse	Handlers-cleaners	Own-child	White	False	236696.000903	9.0	24.0

Tabular core

Initial preprocessing

`make_date`[source]

`add_datepart`[source]

`add_elapsed_times`[source]

`cont_cat_split`[source]

`df_shrink_dtypes`[source]

`df_shrink`[source]

`class` `Tabular`[source]

`class` `TabularPandas`[source]

`class` `TabularProc`[source]

`setups`[source]

`encodes`[source]

`decodes`[source]

`class` `Categorify`[source]

`setups`[source]

`encodes`[source]

`decodes`[source]

`class` `FillStrategy`[source]

`class` `FillMissing`[source]

`class` `ReadTabBatch`[source]

`class` `TabDataLoader`[source]

Integration example

Other target types

Multi-label categories

one-hot encoded label

`setups`[source]

`encodes`[source]

`decodes`[source]

Not one-hot encoded

Regression

`setups`[source]

`encodes`[source]

`decodes`[source]

	Year	Month	Week	Day	Dayofweek	Dayofyear	Is_month_end	Is_month_start	Is_quarter_end	Is_quarter_start	Is_year_end	Is_year_start	Elapsed
0	2019.0	12.0	49.0	4.0	2.0	338.0	False	False	False	False	False	False	1575417600
1	NaN	NaN	NaN	NaN	NaN	NaN	False	False	False	False	False	False	None
2	2019.0	11.0	46.0	15.0	4.0	319.0	False	False	False	False	False	False	1573776000
3	2019.0	10.0	43.0	24.0	3.0	297.0	False	False	False	False	False	False	1571875200

	date	event	base	Afterevent	event_bw	event_fw
0	2019-12-04	False	1	5	1.0	0.0
1	2019-11-29	True	1	0	1.0	1.0
2	2019-11-15	False	2	22	1.0	0.0
3	2019-10-24	True	2	0	1.0	1.0

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	capital-loss	hours-per-week	native-country	salary
0	49	Private	101320	Assoc-acdm	12.0	Married-civ-spouse	NaN	Wife	White	Female	0	1902	40	United-States	>=50k
1	44	Private	236746	Masters	14.0	Divorced	Exec-managerial	Not-in-family	White	Male	10520	0	45	United-States	>=50k
2	38	Private	96185	HS-grad	NaN	Divorced	NaN	Unmarried	Black	Female	0	0	32	United-States	<50k
3	38	Self-emp-inc	112847	Prof-school	15.0	Married-civ-spouse	Prof-specialty	Husband	Asian-Pac-Islander	Male	0	0	40	United-States	>=50k
4	42	Self-emp-not-inc	82297	7th-8th	NaN	Married-civ-spouse	Other-service	Wife	Black	Female	0	0	50	United-States	<50k

	workclass	education	marital-status	occupation	relationship	race	education-num_na	age	fnlwgt	education-num	salary
0	Private	Some-college	Married-spouse-absent	Other-service	Not-in-family	White	False	22.999999	54472.005407	10.0	<50k
1	Private	Some-college	Never-married	Other-service	Other-relative	Black	False	21.000001	236683.999905	10.0	<50k
2	Private	Some-college	Never-married	Sales	Own-child	White	False	18.000001	163786.998406	10.0	<50k
3	Local-gov	Masters	Divorced	#na#	Unmarried	White	False	44.000000	135055.998622	14.0	<50k
4	Self-emp-inc	HS-grad	Married-civ-spouse	Adm-clerical	Husband	White	False	40.000000	207577.999886	9.0	>=50k
5	State-gov	Masters	Married-civ-spouse	Exec-managerial	Husband	White	False	37.000000	210451.999548	14.0	<50k
6	?	Bachelors	Never-married	?	Not-in-family	White	False	32.000000	169885.999453	13.0	<50k
7	Private	HS-grad	Never-married	Adm-clerical	Not-in-family	White	False	20.000000	236804.000495	9.0	<50k
8	Private	Some-college	Married-civ-spouse	Other-service	Husband	White	False	31.000000	137680.998667	10.0	<50k
9	Self-emp-inc	Some-college	Married-civ-spouse	Sales	Husband	White	False	46.000000	284798.997462	10.0	<50k

	workclass	education	marital-status	occupation	relationship	race	education-num_na	age	fnlwgt	education-num	salary
3380	Private	Some-college	Married-civ-spouse	Other-service	Husband	White	False	33.0	248584.0	10.0	<50k
3158	Local-gov	Bachelors	Married-civ-spouse	Exec-managerial	Husband	White	False	51.0	110327.0	13.0	>=50k
8904	Private	Some-college	Never-married	Exec-managerial	Not-in-family	White	False	27.0	133937.0	10.0	<50k
5912	Self-emp-not-inc	Some-college	Married-civ-spouse	Farming-fishing	Husband	White	False	48.0	164582.0	10.0	>=50k
3583	Private	Masters	Never-married	Exec-managerial	Not-in-family	White	False	39.0	49020.0	14.0	<50k
2945	Private	Bachelors	Never-married	Adm-clerical	Own-child	White	False	26.0	166051.0	13.0	<50k
204	?	HS-grad	Married-civ-spouse	#na#	Husband	White	True	60.0	174073.0	10.0	<50k
3196	Private	Some-college	Never-married	Adm-clerical	Own-child	White	False	21.0	241367.0	10.0	<50k
1183	?	Some-college	Married-civ-spouse	?	Husband	White	False	65.0	52728.0	10.0	<50k
2829	Private	Masters	Married-civ-spouse	Prof-specialty	Husband	White	False	46.0	261059.0	14.0	>=50k

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	hours-per-week	native-country	education-num_na
10000	0.466910	5	1.359596	10	1.170520	3	2	1	2	Male	40	Philippines	1
10001	-0.932292	5	1.271990	12	-0.425893	3	15	1	4	Male	40	United-States	1
10002	1.056047	5	0.161911	2	-1.224099	1	9	2	5	Female	37	United-States	1
10003	0.540552	5	-0.274100	12	-0.425893	7	2	5	5	Female	43	United-States	1
10004	0.761479	6	1.462819	9	0.372313	3	5	1	5	Male	60	United-States	1

Tabular core

Initial preprocessing

make_date[source]

add_datepart[source]

add_elapsed_times[source]

cont_cat_split[source]

df_shrink_dtypes[source]

df_shrink[source]

class Tabular[source]

class TabularPandas[source]

class TabularProc[source]

setups[source]

encodes[source]

decodes[source]

class Categorify[source]

setups[source]

encodes[source]

decodes[source]

class FillStrategy[source]

class FillMissing[source]

class ReadTabBatch[source]

class TabDataLoader[source]

Integration example

Other target types

Multi-label categories

one-hot encoded label

setups[source]

encodes[source]

decodes[source]

Not one-hot encoded

Regression

setups[source]

encodes[source]

decodes[source]

Not being used now - for multi-modal

`make_date`[source]

`add_datepart`[source]

`add_elapsed_times`[source]

`cont_cat_split`[source]

`df_shrink_dtypes`[source]

`df_shrink`[source]

`class` `Tabular`[source]

`class` `TabularPandas`[source]

`class` `TabularProc`[source]

`setups`[source]

`encodes`[source]

`decodes`[source]

`class` `Categorify`[source]

`setups`[source]

`encodes`[source]

`decodes`[source]

`class` `FillStrategy`[source]

`class` `FillMissing`[source]

`class` `ReadTabBatch`[source]

`class` `TabDataLoader`[source]

`setups`[source]

`encodes`[source]

`decodes`[source]

`setups`[source]

`encodes`[source]

`decodes`[source]