A wrapper of the data pipeline library "luigi".
Run `pip install gokart` to install the latest version from PyPI.
Documentation for the latest release is hosted on Read the Docs.
Please use gokart.TaskOnKart instead of luigi.Task to define your tasks.
import gokart
class BasicTask(gokart.TaskOnKart):
    """Example task: load the output of TaskA, process it, and dump the results."""

    def requires(self):
        """Declare TaskA as the task whose output this task loads."""
        return TaskA()

    def output(self):
        """Return the target this task writes to."""
        # please use TaskOnKart.make_target to make Target.
        target = self.make_target('basic_task.csv')
        return target

    def run(self):
        """Load the upstream data, build the results, and dump them."""
        # load data which TaskA output
        loaded_texts = self.load()

        # do something with loaded_texts, and make results.
        # (`results` stands for whatever this step produces.)

        # save results with the file path
        # {self.workspace_directory}/basic_task_{unique_id}.csv
        self.dump(results)
`TaskOnKart.make_target` determines the Target type from the extension of the passed path. The following extensions are supported:
- pkl
- txt
- csv
- tsv
- gz
- json
- xml
`TaskOnKart.make_model_target` and `TaskOnKart.dump` are designed to save and load models such as `gensim.models.Word2Vec`.
class TrainWord2Vec(gokart.TaskOnKart):
    """Example task that builds a Word2Vec model and dumps it to a model target."""

    def output(self):
        """Return a model target for 'model.zip'.

        ``save_function`` and ``load_function`` are the callables passed to
        ``make_model_target`` for writing and reading the model; here they are
        gensim's own ``Word2Vec.save`` and ``Word2Vec.load``.
        """
        # please use 'zip'.
        return self.make_model_target(
            'model.zip',
            save_function=gensim.models.Word2Vec.save,
            load_function=gensim.models.Word2Vec.load)

    def run(self):
        """Build the model and dump it."""
        # make word2vec
        # (`word2vec` stands for the model object built in this step.)
        self.dump(word2vec)
def requires(self):
    """Declare two named upstream tasks, keyed by 'data' and 'model'."""
    return {'data': LoadItemData(), 'model': LoadModel()}
def run(self):
    """Load each required input separately by its key."""
    # pass a key in the dictionary `self.requires()`
    data = self.load('data')
    model = self.load('model')
def run(self):
    """Load every required input at once by calling ``self.load()`` without a key."""
    input_data = self.load()
    # The above line is equivalent to the following:
    # input_data = dict(data=self.load('data'), model=self.load('model'))
def requires(self):
    """Declare LoadDataFrame as the single upstream task."""
    upstream = LoadDataFrame()
    return upstream
def run(self):
    """Load the upstream data frame, requesting the 'id' and 'name' columns."""
    # NOTE(review): presumably load_data_frame checks that these columns
    # are present; confirm against the gokart documentation.
    data = self.load_data_frame(required_columns={'id', 'name'})