DSSM model - potential leakage between the sequence feature and the target item
emreatilgan opened this issue · 0 comments
emreatilgan commented
Hello, I am using the EasyRec DSSM model config for a matching model. After train_eval and export, I split the saved model with https://github.com/alibaba/EasyRec/blob/master/easy_rec/python/tools/split_model_pai.py. However, the saved user tower model requires product_id, which is a feature of the item tower. I tried the following feature combinations for the user tower:
- Without the `product_id_seq` feature: the saved user tower model doesn't require `product_id`, as expected, and the recall_at_k metrics have normal values.
- With the `product_id_seq` feature: the saved user tower model requires `product_id` at inference, and the recall_at_k values are ~0.99.
Question: When the sequence feature is used, could the user tower have access to the target item and simply learn to predict that item?
How can I use the history sequence feature and still split the user and item towers successfully?
Note: I'm using TensorFlow 1.12 so that I can use split_model_pai.py.
Config:
train_input_path: "data/train_processed"
eval_input_path: "data/test_processed"
model_dir: "ckpt/dssm_train_processed_negative_sample_earlystop_ckpt"
train_config {
log_step_count_steps: 100
optimizer_config: {
adam_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.001
decay_steps: 1000
decay_factor: 0.5
min_learning_rate: 0.00001
}
}
}
use_moving_average: false
}
save_checkpoints_steps: 500
num_steps: 20000
}
eval_config {
metrics_set {
recall_at_topk { topk: 10 }
}
metrics_set {
recall_at_topk { topk: 50 }
}
metrics_set {
recall_at_topk { topk: 100 }
}
}
data_config {
input_fields {
input_name:'user_id'
input_type: STRING
}
input_fields {
input_name:'user_gender'
input_type: STRING
}
input_fields {
input_name:'product_id_seq'
input_type: STRING
}
input_fields {
input_name: 'product_id'
input_type: STRING
}
input_fields {
input_name: 'label'
input_type: INT32
}
label_fields: 'label'
batch_size: 4096
num_epochs: 20
prefetch_size: 32
input_type: CSVInput
separator: "\t"
negative_sampler {
input_path: 'data/negative_contents_processed'
num_sample: 1024
num_eval_sample: 1024
attr_fields: 'product_id'
item_id_field: 'product_id'
}
}
feature_config: {
features: {
input_names: 'user_id'
feature_type: IdFeature
embedding_dim: 16
hash_bucket_size: 500000
}
features: {
input_names: 'user_gender'
feature_type: IdFeature
embedding_dim: 16
hash_bucket_size: 10
}
features: {
input_names: 'product_id'
feature_type: IdFeature
embedding_dim: 16
hash_bucket_size: 400000
}
features: {
input_names: 'product_id_seq'
feature_type: SequenceFeature
separator: '|'
hash_bucket_size: 400000
embedding_dim: 16
}
}
model_config:{
model_class: "DSSM"
feature_groups: {
group_name: 'user'
feature_names: 'user_id'
feature_names: 'user_gender'
wide_deep:DEEP
sequence_features: {
group_name: "seq_fea"
allow_key_search: true
need_key_feature:true
seq_att_map: {
key: "product_id"
hist_seq: "product_id_seq"
}
}
}
feature_groups: {
group_name: "item"
feature_names: 'product_id'
wide_deep:DEEP
}
dssm {
user_tower {
id: "user_id"
dnn {
hidden_units: [256, 128, 64, 32]
}
}
item_tower {
id: "product_id"
dnn {
hidden_units: [256, 128, 64, 32]
}
}
l2_regularization: 1e-6
}
loss_type: SOFTMAX_CROSS_ENTROPY
embedding_regularization: 5e-6
}
export_config {
exporter_type: "best"
max_check_steps: 500
enable_early_stop: true
best_exporter_metric: "recall@100"
}