@@ -0,0 +1,568 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+evaluation = True
+evaluation_verbose = False
+
+OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
+DATA_BUCKET_FOLDER = "/outbrain/orig/"
+SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"
+LOCAL_DATA_TFRECORDS_DIR = "/outbrain/tfrecords"
+
+TEST_SET_MODE = False
+
+TENSORFLOW_HADOOP = "preproc/data/tensorflow-hadoop-1.5.0.jar"
+
+from IPython.display import display
+
+import pyspark.sql.functions as F
+from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT
+
+from pyspark.context import SparkContext, SparkConf
+from pyspark.sql.session import SparkSession
+
+conf = SparkConf() \
+    .setMaster('local[*]') \
+    .set('spark.executor.memory', '40g') \
+    .set('spark.driver.memory', '200g') \
+    .set("spark.local.dir", SPARK_TEMP_FOLDER)
+conf.set("spark.jars", TENSORFLOW_HADOOP)
+conf.set("spark.sql.files.maxPartitionBytes", 805306368)
+
+sc = SparkContext(conf=conf)
+spark = SparkSession(sc)
+
+from pyspark.sql import Row
+from pyspark.sql.types import ArrayType, BinaryType, DoubleType, LongType, StringType, StructField, StructType
+from pyspark.sql.functions import col, when, log1p, udf
+
+import numpy as np
+import scipy.sparse
+
+import math
+import datetime
+import time
+import itertools
+
+import pickle
+
+import pandas as pd
+import tensorflow as tf
+from tensorflow_transform.tf_metadata import dataset_schema
+from tensorflow_transform.tf_metadata import dataset_metadata
+from tensorflow_transform.tf_metadata import metadata_io
+
+import trainer
+from trainer.features import (LABEL_COLUMN, DISPLAY_ID_COLUMN, AD_ID_COLUMN, IS_LEAK_COLUMN,
+                              DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN, CATEGORICAL_COLUMNS,
+                              DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS,
+                              FLOAT_COLUMNS, FLOAT_COLUMNS_LOG_BIN_TRANSFORM,
+                              FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM)
+
+import argparse
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    '--prebatch_size',
+    help='Prebatch size in created tfrecords',
+    type=int,
+    default=4096)
+
+parser.add_argument(
+    '--submission',
+    action='store_true',
+    default=False
+)
+
+args = parser.parse_args()
+
+batch_size = args.prebatch_size
+
+# # Feature Vector export
+bool_feature_names = ['event_weekend',
+                      'user_has_already_viewed_doc']
+
+int_feature_names = ['user_views',
+                     'ad_views',
+                     'doc_views',
+                     'doc_event_days_since_published',
+                     'doc_event_hour',
+                     'doc_ad_days_since_published',
+                     ]
+
+float_feature_names = [
+    'pop_ad_id',
+    'pop_ad_id_conf',
+    'pop_ad_id_conf_multipl',
+    'pop_document_id',
+    'pop_document_id_conf',
+    'pop_document_id_conf_multipl',
+    'pop_publisher_id',
+    'pop_publisher_id_conf',
+    'pop_publisher_id_conf_multipl',
+    'pop_advertiser_id',
+    'pop_advertiser_id_conf',
+    'pop_advertiser_id_conf_multipl',
+    'pop_campain_id',
+    'pop_campain_id_conf',
+    'pop_campain_id_conf_multipl',
+    'pop_doc_event_doc_ad',
+    'pop_doc_event_doc_ad_conf',
+    'pop_doc_event_doc_ad_conf_multipl',
+    'pop_source_id',
+    'pop_source_id_conf',
+    'pop_source_id_conf_multipl',
+    'pop_source_id_country',
+    'pop_source_id_country_conf',
+    'pop_source_id_country_conf_multipl',
+    'pop_entity_id',
+    'pop_entity_id_conf',
+    'pop_entity_id_conf_multipl',
+    'pop_entity_id_country',
+    'pop_entity_id_country_conf',
+    'pop_entity_id_country_conf_multipl',
+    'pop_topic_id',
+    'pop_topic_id_conf',
+    'pop_topic_id_conf_multipl',
+    'pop_topic_id_country',
+    'pop_topic_id_country_conf',
+    'pop_topic_id_country_conf_multipl',
+    'pop_category_id',
+    'pop_category_id_conf',
+    'pop_category_id_conf_multipl',
+    'pop_category_id_country',
+    'pop_category_id_country_conf',
+    'pop_category_id_country_conf_multipl',
+    'user_doc_ad_sim_categories',
+    'user_doc_ad_sim_categories_conf',
+    'user_doc_ad_sim_categories_conf_multipl',
+    'user_doc_ad_sim_topics',
+    'user_doc_ad_sim_topics_conf',
+    'user_doc_ad_sim_topics_conf_multipl',
+    'user_doc_ad_sim_entities',
+    'user_doc_ad_sim_entities_conf',
+    'user_doc_ad_sim_entities_conf_multipl',
+    'doc_event_doc_ad_sim_categories',
+    'doc_event_doc_ad_sim_categories_conf',
+    'doc_event_doc_ad_sim_categories_conf_multipl',
+    'doc_event_doc_ad_sim_topics',
+    'doc_event_doc_ad_sim_topics_conf',
+    'doc_event_doc_ad_sim_topics_conf_multipl',
+    'doc_event_doc_ad_sim_entities',
+    'doc_event_doc_ad_sim_entities_conf',
+    'doc_event_doc_ad_sim_entities_conf_multipl'
+]
+
+# ### Configuring feature vector
+category_feature_names_integral = ['ad_advertiser',
+                                   'doc_ad_category_id_1',
+                                   'doc_ad_category_id_2',
+                                   'doc_ad_category_id_3',
+                                   'doc_ad_topic_id_1',
+                                   'doc_ad_topic_id_2',
+                                   'doc_ad_topic_id_3',
+                                   'doc_ad_entity_id_1',
+                                   'doc_ad_entity_id_2',
+                                   'doc_ad_entity_id_3',
+                                   'doc_ad_entity_id_4',
+                                   'doc_ad_entity_id_5',
+                                   'doc_ad_entity_id_6',
+                                   'doc_ad_publisher_id',
+                                   'doc_ad_source_id',
+                                   'doc_event_category_id_1',
+                                   'doc_event_category_id_2',
+                                   'doc_event_category_id_3',
+                                   'doc_event_topic_id_1',
+                                   'doc_event_topic_id_2',
+                                   'doc_event_topic_id_3',
+                                   'doc_event_entity_id_1',
+                                   'doc_event_entity_id_2',
+                                   'doc_event_entity_id_3',
+                                   'doc_event_entity_id_4',
+                                   'doc_event_entity_id_5',
+                                   'doc_event_entity_id_6',
+                                   'doc_event_publisher_id',
+                                   'doc_event_source_id',
+                                   'event_country',
+                                   'event_country_state',
+                                   'event_geo_location',
+                                   'event_hour',
+                                   'event_platform',
+                                   'traffic_source']
+
+
+feature_vector_labels_integral = bool_feature_names + int_feature_names + \
+    float_feature_names + category_feature_names_integral
+
+if args.submission:
+    train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral'
+else:
+    train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral_eval'
+
+# ## Exporting integral feature vectors to CSV
+train_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER + train_feature_vector_gcs_folder_name)
+if evaluation_verbose:
+    display(train_feature_vectors_exported_df.take(3))
+
+integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
+
+CSV_ORDERED_COLUMNS = ['label','display_id','ad_id','doc_id','doc_event_id','is_leak','event_weekend',
+                       'user_has_already_viewed_doc','user_views','ad_views','doc_views',
+                       'doc_event_days_since_published','doc_event_hour','doc_ad_days_since_published',
+                       'pop_ad_id','pop_ad_id_conf',
+                       'pop_ad_id_conf_multipl','pop_document_id','pop_document_id_conf',
+                       'pop_document_id_conf_multipl','pop_publisher_id','pop_publisher_id_conf',
+                       'pop_publisher_id_conf_multipl','pop_advertiser_id','pop_advertiser_id_conf',
+                       'pop_advertiser_id_conf_multipl','pop_campain_id','pop_campain_id_conf',
+                       'pop_campain_id_conf_multipl','pop_doc_event_doc_ad','pop_doc_event_doc_ad_conf',
+                       'pop_doc_event_doc_ad_conf_multipl','pop_source_id','pop_source_id_conf',
+                       'pop_source_id_conf_multipl','pop_source_id_country','pop_source_id_country_conf',
+                       'pop_source_id_country_conf_multipl','pop_entity_id','pop_entity_id_conf',
+                       'pop_entity_id_conf_multipl','pop_entity_id_country','pop_entity_id_country_conf',
+                       'pop_entity_id_country_conf_multipl','pop_topic_id','pop_topic_id_conf',
+                       'pop_topic_id_conf_multipl','pop_topic_id_country','pop_topic_id_country_conf',
+                       'pop_topic_id_country_conf_multipl','pop_category_id','pop_category_id_conf',
+                       'pop_category_id_conf_multipl','pop_category_id_country','pop_category_id_country_conf',
+                       'pop_category_id_country_conf_multipl','user_doc_ad_sim_categories',
+                       'user_doc_ad_sim_categories_conf','user_doc_ad_sim_categories_conf_multipl',
+                       'user_doc_ad_sim_topics','user_doc_ad_sim_topics_conf','user_doc_ad_sim_topics_conf_multipl',
+                       'user_doc_ad_sim_entities','user_doc_ad_sim_entities_conf','user_doc_ad_sim_entities_conf_multipl',
+                       'doc_event_doc_ad_sim_categories','doc_event_doc_ad_sim_categories_conf',
+                       'doc_event_doc_ad_sim_categories_conf_multipl','doc_event_doc_ad_sim_topics',
+                       'doc_event_doc_ad_sim_topics_conf','doc_event_doc_ad_sim_topics_conf_multipl',
+                       'doc_event_doc_ad_sim_entities','doc_event_doc_ad_sim_entities_conf',
+                       'doc_event_doc_ad_sim_entities_conf_multipl','ad_advertiser','doc_ad_category_id_1',
+                       'doc_ad_category_id_2','doc_ad_category_id_3','doc_ad_topic_id_1','doc_ad_topic_id_2',
+                       'doc_ad_topic_id_3','doc_ad_entity_id_1','doc_ad_entity_id_2','doc_ad_entity_id_3',
+                       'doc_ad_entity_id_4','doc_ad_entity_id_5','doc_ad_entity_id_6','doc_ad_publisher_id',
+                       'doc_ad_source_id','doc_event_category_id_1','doc_event_category_id_2','doc_event_category_id_3',
+                       'doc_event_topic_id_1','doc_event_topic_id_2','doc_event_topic_id_3','doc_event_entity_id_1',
+                       'doc_event_entity_id_2','doc_event_entity_id_3','doc_event_entity_id_4','doc_event_entity_id_5',
+                       'doc_event_entity_id_6','doc_event_publisher_id','doc_event_source_id','event_country',
+                       'event_country_state','event_geo_location','event_hour','event_platform','traffic_source']
+
+# The feature columns alone, i.e. CSV_ORDERED_COLUMNS without the six
+# label/id columns at the front.
+FEAT_CSV_ORDERED_COLUMNS = CSV_ORDERED_COLUMNS[6:]
+
+def to_array(col):
+    def to_array_(v):
+        return v.toArray().tolist()
+    # Important: asNondeterministic requires Spark 2.3 or later.
+    # It can be safely removed, i.e.
+    #   return udf(to_array_, ArrayType(DoubleType()))(col)
+    # but at the cost of decreased performance.
+    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)
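+
+
+# A minimal sanity check of to_array on a toy vector column; the demo
+# DataFrame below is purely illustrative and runs only when the
+# evaluation_verbose flag is enabled.
+if evaluation_verbose:
+    demo_df = spark.createDataFrame([(Vectors.dense([1.0, 0.0, 3.0]),)],
+                                    ["feature_vector"])
+    demo_df.withColumn("featvec", to_array("feature_vector")).show()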
+
+
+CONVERT_TO_INT = ['doc_ad_category_id_1',
+                  'doc_ad_category_id_2','doc_ad_category_id_3','doc_ad_topic_id_1','doc_ad_topic_id_2',
+                  'doc_ad_topic_id_3','doc_ad_entity_id_1','doc_ad_entity_id_2','doc_ad_entity_id_3',
+                  'doc_ad_entity_id_4','doc_ad_entity_id_5','doc_ad_entity_id_6',
+                  'doc_ad_source_id','doc_event_category_id_1','doc_event_category_id_2','doc_event_category_id_3',
+                  'doc_event_topic_id_1','doc_event_topic_id_2','doc_event_topic_id_3','doc_event_entity_id_1',
+                  'doc_event_entity_id_2','doc_event_entity_id_3','doc_event_entity_id_4','doc_event_entity_id_5',
+                  'doc_event_entity_id_6']
+
+
+def format_number(element, name):
+    # Bool/categorical columns and the CONVERT_TO_INT columns are cast to
+    # int; everything else passes through unchanged.
+    if name in BOOL_COLUMNS + CATEGORICAL_COLUMNS or name in CONVERT_TO_INT:
+        return element.cast("int")
+    else:
+        return element
+
+def to_array_with_none(col):
+    def to_array_with_none_(v):
+        tmp = np.full((v.size,), fill_value=None, dtype=np.float64)
+        tmp[v.indices] = v.values
+        return tmp.tolist()
+    # Important: asNondeterministic requires Spark 2.3 or later.
+    # It can be safely removed, i.e.
+    #   return udf(to_array_with_none_, ArrayType(DoubleType()))(col)
+    # but at the cost of decreased performance.
+    return udf(to_array_with_none_, ArrayType(DoubleType())).asNondeterministic()(col)
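+
+
+# A quick, illustrative check of to_array_with_none on a SparseVector:
+# positions absent from the vector come back as NaN rather than 0.0.
+# Runs only when evaluation_verbose is enabled.
+if evaluation_verbose:
+    sparse_demo_df = spark.createDataFrame([(SparseVector(4, {1: 2.0}),)],
+                                           ["feature_vector"])
+    sparse_demo_df.withColumn("featvec", to_array_with_none("feature_vector")).show(truncate=False)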
+
+
+@udf
+def count_value(x):
+    from collections import Counter
+    tmp = Counter(x).most_common(2)
+    if not tmp or np.isnan(tmp[0][0]):
+        return 0
+    return float(tmp[0][0])
+
+
+def replace_with_most_frequent(most_value):
+    return udf(lambda x: most_value if not x or np.isnan(x) else x)
+
+
+train_feature_vectors_integral_csv_rdd_df = (
+    train_feature_vectors_exported_df
+    .select('label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector')
+    .withColumn('is_leak', F.lit(-1))
+    .withColumn("featvec", to_array("feature_vector"))
+    .select(['label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak'] +
+            [format_number(element, FEAT_CSV_ORDERED_COLUMNS[index]).alias(FEAT_CSV_ORDERED_COLUMNS[index])
+             for index, element in enumerate([col("featvec")[i]
+                                              for i in range(len(feature_vector_labels_integral))])])
+    .replace(float('nan'), 0))
+
+if args.submission:
+    test_validation_feature_vector_gcs_folder_name = 'test_feature_vectors_integral'
+else:
+    test_validation_feature_vector_gcs_folder_name = 'validation_feature_vectors_integral'
+
+# ## Exporting integral feature vectors
+test_validation_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER + test_validation_feature_vector_gcs_folder_name)
+if evaluation_verbose:
+    display(test_validation_feature_vectors_exported_df.take(3))
+
+test_validation_feature_vectors_integral_csv_rdd_df = (
+    test_validation_feature_vectors_exported_df
+    .select('label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak', 'feature_vector')
+    .withColumn("featvec", to_array("feature_vector"))
+    .select(['label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak'] +
+            [format_number(element, FEAT_CSV_ORDERED_COLUMNS[index]).alias(FEAT_CSV_ORDERED_COLUMNS[index])
+             for index, element in enumerate([col("featvec")[i]
+                                              for i in range(len(feature_vector_labels_integral))])])
+    .replace(float('nan'), 0))
+
+def make_spec(output_dir, batch_size=None):
+    fixed_shape = [batch_size, 1] if batch_size is not None else []
+    spec = {}
+    spec[LABEL_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
+    spec[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
+    spec[IS_LEAK_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
+    spec[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
+    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
+        spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
+    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
+        spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
+    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
+        spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
+        spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
+    for name in INT_COLUMNS:
+        spec[name + '_log_int'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
+        spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
+    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
+        spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
+    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
+        shape = fixed_shape[:-1] + [len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])]
+        spec[multi_category] = tf.FixedLenFeature(shape=shape, dtype=tf.int64)
+    metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec))
+    metadata_io.write_metadata(metadata, output_dir)
+
+
+# Write out the TFRecords metadata
+make_spec(LOCAL_DATA_TFRECORDS_DIR + '/transformed_metadata', batch_size=batch_size)
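+
+# Hedged sanity check: read the schema back with tensorflow_transform's
+# metadata_io to confirm the write succeeded; enabled only under
+# evaluation_verbose.
+if evaluation_verbose:
+    written_metadata = metadata_io.read_metadata(LOCAL_DATA_TFRECORDS_DIR + '/transformed_metadata')
+    print(written_metadata.schema)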
+
+
+def log2_1p(x):
+    return np.log1p(x) / np.log(2.0)
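+
+
+# log2_1p computes log2(1 + x); for example log2_1p(1.0) == 1.0 and
+# log2_1p(3.0) == 2.0. It compresses the long-tailed count/popularity
+# values before the min-max scaling below.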
+
+
+# Calculate min and max stats for the given dataframe all in one go
+def compute_min_max_logs(df):
+    print(str(datetime.datetime.now()) + '\tComputing min and max')
+    min_logs = {}
+    max_logs = {}
+    float_expr = []
+    for name in trainer.features.FLOAT_COLUMNS_LOG_BIN_TRANSFORM + trainer.features.INT_COLUMNS:
+        float_expr.append(F.min(name))
+        float_expr.append(F.max(name))
+    floatDf = df.agg(*float_expr).collect()
+    for name in trainer.features.FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
+        minAgg = floatDf[0]["min(" + name + ")"]
+        maxAgg = floatDf[0]["max(" + name + ")"]
+        min_logs[name + '_log_01scaled'] = log2_1p(minAgg * 1000)
+        max_logs[name + '_log_01scaled'] = log2_1p(maxAgg * 1000)
+    for name in trainer.features.INT_COLUMNS:
+        minAgg = floatDf[0]["min(" + name + ")"]
+        maxAgg = floatDf[0]["max(" + name + ")"]
+        min_logs[name + '_log_01scaled'] = log2_1p(minAgg)
+        max_logs[name + '_log_01scaled'] = log2_1p(maxAgg)
+
+    return min_logs, max_logs
+
+
+all_df = test_validation_feature_vectors_integral_csv_rdd_df.union(train_feature_vectors_integral_csv_rdd_df)
+min_logs, max_logs = compute_min_max_logs(all_df)
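+
+# Optional visibility into the computed scaling constants (illustrative
+# debug output; prints only when evaluation_verbose is enabled).
+if evaluation_verbose:
+    print('min_logs:', min_logs)
+    print('max_logs:', max_logs)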
+
+if args.submission:
+    train_output_string = '/sub_train'
+    eval_output_string = '/test'
+else:
+    train_output_string = '/train'
+    eval_output_string = '/eval'
+
+path = LOCAL_DATA_TFRECORDS_DIR
+
+def create_tf_example_spark(df, min_logs, max_logs):
+    result = {}
+    result[LABEL_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[LABEL_COLUMN].to_list()))
+    result[DISPLAY_ID_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[DISPLAY_ID_COLUMN].to_list()))
+    result[IS_LEAK_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[IS_LEAK_COLUMN].to_list()))
+    encoded_value = df[DISPLAY_ID_COLUMN].multiply(10).add(df[IS_LEAK_COLUMN].clip(lower=0)).to_list()
+    result[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=encoded_value))
+    for name in FLOAT_COLUMNS:
+        value = df[name].to_list()
+        result[name] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
+    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
+        value = df[name].multiply(10).astype('int64').to_list()
+        result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
+        value_prelim = df[name].multiply(1000).apply(np.log1p).multiply(1. / np.log(2.0))
+        value = value_prelim.astype('int64').to_list()
+        result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+        nn = name + '_log_01scaled'
+        value = value_prelim.add(-min_logs[nn]).multiply(1. / (max_logs[nn] - min_logs[nn])).to_list()
+        result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
+    for name in INT_COLUMNS:
+        value_prelim = df[name].apply(np.log1p).multiply(1. / np.log(2.0))
+        value = value_prelim.astype('int64').to_list()
+        result[name + '_log_int'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+        nn = name + '_log_01scaled'
+        value = value_prelim.add(-min_logs[nn]).multiply(1. / (max_logs[nn] - min_logs[nn])).to_list()
+        result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
+    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
+        value = df[name].fillna(0).astype('int64').to_list()
+        result[name] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
+        values = []
+        for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]:
+            values = values + [df[category].to_numpy()]
+        # Transpose the stacked series so values are stored row-major in
+        # [batch_size, num_values] order; the FixedLenFeature reshapes the
+        # flat list back to that shape when the TFRecord is parsed.
+        value = np.stack(values, axis=1).flatten().tolist()
+        result[multi_category] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+    tf_example = tf.train.Example(features=tf.train.Features(feature=result))
+    return tf_example
+
+
+def _transform_to_tfrecords(rdds):
+    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
+    num_rows = len(csv.index)
+    examples = []
+    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
+        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
+            # keep the short remainder as a final, smaller Example
+            csv_slice = csv.iloc[start_ind:]
+            print("last Example has: ", len(csv_slice))
+            examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), len(csv_slice)))
+            return examples
+        else:
+            csv_slice = csv.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
+            examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), batch_size))
+    return examples
+
+
+from pyspark import TaskContext
+
+max_partition_num = 30
+
+
+def _transform_to_slices(rdds):
+    taskcontext = TaskContext.get()
+    partitionid = taskcontext.partitionId()
+    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
+    num_rows = len(csv.index)
+    print("working with partition: ", partitionid, max_partition_num, num_rows)
+    examples = []
+    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
+        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
+            csv_slice = csv.iloc[start_ind:]
+            print("last Example has: ", len(csv_slice), partitionid)
+            examples.append((csv_slice, len(csv_slice)))
+            return examples
+        else:
+            csv_slice = csv.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
+            examples.append((csv_slice, len(csv_slice)))
+    return examples
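+
+
+# For example, with batch_size = 4096 a partition of 10,000 rows yields
+# slices of length [4096, 4096, 1808]; the short final slice is filtered
+# out below and either resliced (train) or padded (test/validation).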
+
+
+def _transform_to_tfrecords_from_slices(rdds):
+    examples = []
+    for df_slice in rdds:
+        if len(df_slice[0]) != batch_size:
+            print("slice size is not correct, dropping: ", len(df_slice[0]))
+        else:
+            examples.append((bytearray(create_tf_example_spark(df_slice[0], min_logs, max_logs).SerializeToString()), None))
+    return examples
+
+
+def _transform_to_tfrecords_from_reslice(rdds):
+    examples = []
+    all_dataframes = pd.DataFrame([])
+    for df_slice in rdds:
+        all_dataframes = all_dataframes.append(df_slice[0])
+    num_rows = len(all_dataframes.index)
+    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
+        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
+            csv_slice = all_dataframes.iloc[start_ind:]
+            if TEST_SET_MODE:
+                # pad the short final batch up to batch_size by repeating
+                # copies of its own rows
+                remain_len = batch_size - len(csv_slice)
+                (m, n) = divmod(remain_len, len(csv_slice))
+                print("remainder: ", len(csv_slice), remain_len, m, n)
+                original_slice = csv_slice
+                for i in range(m):
+                    csv_slice = csv_slice.append(original_slice)
+                csv_slice = csv_slice.append(original_slice.iloc[:n])
+                print("after fill remainder: ", len(csv_slice))
+                examples.append((bytearray(create_tf_example_spark(csv_slice, min_logs, max_logs).SerializeToString()), None))
+                return examples
+            # in train mode the short remainder is simply dropped
+            print("dropping remainder: ", len(csv_slice))
+            return examples
+        else:
+            csv_slice = all_dataframes.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
+            examples.append((bytearray(create_tf_example_spark(csv_slice, min_logs, max_logs).SerializeToString()), None))
+    return examples
+
+
+TEST_SET_MODE = False
+train_features = train_feature_vectors_integral_csv_rdd_df.coalesce(30).rdd.mapPartitions(_transform_to_slices)
+cached_train_features = train_features.cache()
+cached_train_features.count()
+train_full = cached_train_features.filter(lambda x: x[1] == batch_size)
+# split out slices where we don't have a full batch so that we can reslice
+# them and drop only a minimal number of rows
+train_not_full = cached_train_features.filter(lambda x: x[1] < batch_size)
+train_examples_full = train_full.mapPartitions(_transform_to_tfrecords_from_slices)
+train_left = train_not_full.coalesce(1).mapPartitions(_transform_to_tfrecords_from_reslice)
+all_train = train_examples_full.union(train_left)
+
+TEST_SET_MODE = True
+valid_features = test_validation_feature_vectors_integral_csv_rdd_df.coalesce(30).rdd.mapPartitions(_transform_to_slices)
+cached_valid_features = valid_features.cache()
+cached_valid_features.count()
+valid_full = cached_valid_features.filter(lambda x: x[1] == batch_size)
+valid_not_full = cached_valid_features.filter(lambda x: x[1] < batch_size)
+valid_examples_full = valid_full.mapPartitions(_transform_to_tfrecords_from_slices)
+valid_left = valid_not_full.coalesce(1).mapPartitions(_transform_to_tfrecords_from_reslice)
+all_valid = valid_examples_full.union(valid_left)
+
+all_train.saveAsNewAPIHadoopFile(LOCAL_DATA_TFRECORDS_DIR + train_output_string,
+                                 "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
+                                 keyClass="org.apache.hadoop.io.BytesWritable",
+                                 valueClass="org.apache.hadoop.io.NullWritable")
+
+all_valid.saveAsNewAPIHadoopFile(LOCAL_DATA_TFRECORDS_DIR + eval_output_string,
+                                 "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
+                                 keyClass="org.apache.hadoop.io.BytesWritable",
+                                 valueClass="org.apache.hadoop.io.NullWritable")
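+
+# A minimal, hedged sanity check (runs only under evaluation_verbose): read
+# one prebatched Example back from the train output using TF 1.x APIs. The
+# part-r-* glob matches the layout saveAsNewAPIHadoopFile typically
+# produces; adjust it if your layout differs.
+if evaluation_verbose:
+    record_files = tf.gfile.Glob(LOCAL_DATA_TFRECORDS_DIR + train_output_string + '/part-r-*')
+    record_iter = tf.python_io.tf_record_iterator(record_files[0])
+    example = tf.train.Example.FromString(next(record_iter))
+    print(sorted(example.features.feature.keys())[:10])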
+
+spark.stop()