Browse Source

Merge pull request #437 from NVIDIA/nvpstr/85d346c0b

[WideAndDeep] Fixes for alerts reported by LGTM
PrzemekS 6 years ago
parent
commit
9492dc7cb7

+ 1 - 6
TensorFlow/Recommendation/WideAndDeep/README.md

@@ -25,7 +25,6 @@ This repository provides a script and recipe to train the Wide and Deep Recommen
     * [Getting the data](#getting-the-data)
         * [Dataset guidelines](#dataset-guidelines)
     * [Training process](#training-process)
-    * [Deploying the Wide & Deep model using Triton Inference Server](#deploying-the-wide-deep-model-using-triton-inference-server)
 - [Performance](#performance)
     * [Benchmarking](#benchmarking)
         * [Training performance benchmark](#training-performance-benchmark)
@@ -181,7 +180,7 @@ To train your model using mixed precision with Tensor Cores or using FP32, perfo
 
 ```
 git clone https://github.com/NVIDIA/DeepLearningExamples
-cd DeepLearningExamples/TensorFlow/Recommendation/WideDeep
+cd DeepLearningExamples/TensorFlow/Recommendation/WideAndDeep
 ```
 
 2.  Download the Outbrain dataset.
@@ -326,10 +325,6 @@ The training log will contain information about:
 
 Checkpoints are stored at the end of every `--save_checkpoints_steps` at the `--model_dir` location.
 
-### Deploying the Wide & Deep model using Triton Inference Server
-
-This repository does not contain code for deploying the model using Triton Inference Server. The details of such deployment together with obtained performance numbers was discussed on the [blog post](https://devblogs.nvidia.com/accelerating-wide-deep-recommender-inference-on-gpus/).
-
 ## Performance
 
 ### Benchmarking

+ 0 - 5
TensorFlow/Recommendation/WideAndDeep/dataflow_preprocess.py

@@ -18,14 +18,9 @@ from __future__ import print_function
 
 import argparse
 import datetime
-import os
-import random
-import subprocess
 import sys
-from joblib import Parallel, delayed
 
 import outbrain_transform
-import path_constants
 
 import tensorflow as tf
 import glob

+ 1 - 41
TensorFlow/Recommendation/WideAndDeep/outbrain_transform.py

@@ -22,9 +22,8 @@ from tensorflow_transform.tf_metadata import dataset_schema
 from tensorflow_transform.tf_metadata import dataset_metadata
 from tensorflow_transform.tf_metadata import metadata_io
 import numpy as np
-import pandas as pd
 
-from trainer.features import LABEL_COLUMN, DISPLAY_ID_COLUMN, AD_ID_COLUMN, IS_LEAK_COLUMN, DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN, CATEGORICAL_COLUMNS, DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS, FLOAT_COLUMNS, FLOAT_COLUMNS_LOG_BIN_TRANSFORM, FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM
+from trainer.features import LABEL_COLUMN, DISPLAY_ID_COLUMN, IS_LEAK_COLUMN, DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN, CATEGORICAL_COLUMNS, DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS, FLOAT_COLUMNS, FLOAT_COLUMNS_LOG_BIN_TRANSFORM, FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM
 
 RENAME_COLUMNS = False
 
@@ -95,42 +94,6 @@ def make_spec(output_dir, batch_size=None):
 	
   metadata_io.write_metadata(metadata, output_dir)
 
-def make_input_schema(mode=tf.contrib.learn.ModeKeys.TRAIN, batch_size=None):
-  """Input schema definition.
-
-  Args:
-    mode: tf.contrib.learn.ModeKeys specifying if the schema is being used for
-      train/eval or prediction.
-    batch_size: None if not explicitly batched (for FixedLenFeature size of []), 
-      otherwise the number of elements to assume will be grouped (size of [batch_size])
-  Returns:
-    A `Schema` object.
-  """
-  fixed_shape = [batch_size] if batch_size is not None else []
-  result = {}
-  result[LABEL_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64)
-  result[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32)
-  #result[AD_ID_COLUMN] = tf.VarLenFeature(dtype=tf.float32)
-  result[IS_LEAK_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64)
-  for name in BOOL_COLUMNS:
-    #result[name] = tf.VarLenFeature(dtype=tf.int64)
-    result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=0.0)
-  #TODO: Create dummy features that indicates whether any of the numeric features is null 
-  #(currently default 0 value might introduce noise)
-  for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM+FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
-    result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0)  
-  for name in INT_COLUMNS:
-    result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0)
-  for name in CATEGORICAL_COLUMNS:
-    result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0)
-    #result[name] = tf.VarLenFeature(dtype=tf.float32)
-  for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
-    for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]:
-      result[category] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0)
-      #result[category] = tf.VarLenFeature(dtype=tf.float32)
-
-  return dataset_schema.from_feature_spec(result)
-
 def tf_log2_1p(x):
   return tf.log1p(x) / tf.log(2.0)
 
@@ -163,9 +126,6 @@ def scale_to_0_1(val, minv, maxv):
   return (val - minv) / (maxv - minv)
 
 def create_tf_example(df, min_logs, max_logs):
-  names = CSV_ORDERED_COLUMNS
-  #columns_dict = dict(zip(names, row))
-  
   result = {}
   result[LABEL_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[LABEL_COLUMN].to_list()))
   result[DISPLAY_ID_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[DISPLAY_ID_COLUMN].to_list()))

+ 0 - 47
TensorFlow/Recommendation/WideAndDeep/path_constants.py

@@ -1,47 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Copyright 2016 Google Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""File paths for the Criteo Classification pipeline.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-TEMP_DIR = 'tmp'
-TRANSFORM_FN_DIR = 'transform_fn'
-RAW_METADATA_DIR = 'raw_metadata'
-TRANSFORMED_METADATA_DIR = 'transformed_metadata'
-TRANSFORMED_TRAIN_DATA_FILE_PREFIX = 'features_train'
-TRANSFORMED_EVAL_DATA_FILE_PREFIX = 'features_eval'
-TRANSFORMED_PREDICT_DATA_FILE_PREFIX = 'features_predict'
-TRAIN_RESULTS_FILE = 'train_results'
-DEPLOY_SAVED_MODEL_DIR = 'saved_model'
-MODEL_EVALUATIONS_FILE = 'model_evaluations'
-BATCH_PREDICTION_RESULTS_FILE = 'batch_prediction_results'

+ 1 - 1
TensorFlow/Recommendation/WideAndDeep/preproc/preproc1.py

@@ -19,7 +19,7 @@ OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
 DATA_BUCKET_FOLDER = "/outbrain/orig/"
 SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"
 
-from pyspark.sql.types import *
+from pyspark.sql.types import IntegerType, StringType, StructType, StructField
 import pyspark.sql.functions as F
 
 from pyspark.context import SparkContext, SparkConf

+ 1 - 8
TensorFlow/Recommendation/WideAndDeep/preproc/preproc2.py

@@ -19,17 +19,10 @@ OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
 DATA_BUCKET_FOLDER = "/outbrain/orig/"
 SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"
 
-from IPython.display import display
-
-from pyspark.sql.types import *
+from pyspark.sql.types import IntegerType, StringType, StructType, StructField, TimestampType, FloatType, ArrayType, MapType
 import pyspark.sql.functions as F
 
-from pyspark.sql import DataFrameWriter
-
-import numpy as np
-
 import math
-import datetime
 import time
 
 import random

+ 8 - 21
TensorFlow/Recommendation/WideAndDeep/preproc/preproc3.py

@@ -21,13 +21,9 @@ OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
 DATA_BUCKET_FOLDER = "/outbrain/orig/"
 SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"
 
-
-from IPython.display import display
-
-
-from pyspark.sql.types import *
+from pyspark.sql.types import IntegerType, StringType, StructType, StructField, TimestampType, FloatType, ArrayType, MapType
 import pyspark.sql.functions as F
-from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT
+from pyspark.ml.linalg import SparseVector, VectorUDT
 
 from pyspark.context import SparkContext, SparkConf
 from pyspark.sql.session import SparkSession
@@ -38,17 +34,10 @@ sc = SparkContext(conf=conf)
 spark = SparkSession(sc)
 
 import numpy as np
-import scipy.sparse
 
 import math
 import datetime
 import time
-import itertools
-
-import pickle
-
-import random
-random.seed(42)
 
 import pandas as pd
 
@@ -461,7 +450,7 @@ else:
 
 # # Training models
 def is_null(value):
-    return value == None or len(str(value).strip()) == 0
+    return value is None or len(str(value).strip()) == 0
 
 LESS_SPECIAL_CAT_VALUE = 'less'
 def get_category_field_values_counts(field, df, min_threshold=10):
@@ -490,7 +479,7 @@ len(doc_entity_id_values_counts)
 
 # ## Processing average CTR by categories
 def get_percentiles(df, field, quantiles_levels=None, max_error_rate=0.0):
-    if quantiles_levels == None:
+    if quantiles_levels is None:
         quantiles_levels = np.arange(0.0, 1.1, 0.1).tolist() 
     quantiles = df.approxQuantile(field, quantiles_levels, max_error_rate)
     return dict(zip(quantiles_levels, quantiles))
@@ -896,7 +885,7 @@ def get_days_diff(newer_timestamp, older_timestamp):
     return days_diff
 
 def get_time_decay_factor(timestamp, timestamp_ref=None, alpha=0.001):
-    if timestamp_ref == None:
+    if timestamp_ref is None:
         timestamp_ref = time.time()
         
     days_diff = get_days_diff(timestamp_ref, timestamp)
@@ -1146,7 +1135,7 @@ def cosine_similarity_dicts(dict1, dict2):
     return sum_common_aspects / (dict1_norm * dict2_norm), intersections
 
 def cosine_similarity_user_docs_aspects(user_aspect_profile, doc_aspect_ids, doc_aspects_confidence, aspect_docs_counts):
-    if user_aspect_profile==None or len(user_aspect_profile) == 0 or doc_aspect_ids == None or len(doc_aspect_ids) == 0:
+    if user_aspect_profile is None or len(user_aspect_profile) == 0 or doc_aspect_ids is None or len(doc_aspect_ids) == 0:
         return None, None
         
     doc_aspects = dict(zip(doc_aspect_ids, doc_aspects_confidence))
@@ -1170,7 +1159,6 @@ def cosine_similarity_user_docs_aspects(user_aspect_profile, doc_aspect_ids, doc
         random_error = math.pow(len(doc_aspects) / float(len(aspect_docs_counts)), 
           intersections) * math.pow(len(user_aspect_profile) / float(len(aspect_docs_counts)), 
           intersections)
-        confidence = 1.0 - random_error
     else:
         #P(A not intersect B) = 1 - P(A intersect B)
         random_error = 1 - ((len(doc_aspects) / float(len(aspect_docs_counts))) * 
@@ -1183,8 +1171,8 @@ def cosine_similarity_user_docs_aspects(user_aspect_profile, doc_aspect_ids, doc
 def cosine_similarity_doc_event_doc_ad_aspects(doc_event_aspect_ids, doc_event_aspects_confidence, 
         doc_ad_aspect_ids, doc_ad_aspects_confidence, 
         aspect_docs_counts):
-    if doc_event_aspect_ids == None or len(doc_event_aspect_ids) == 0 \
-            or doc_ad_aspect_ids == None or len(doc_ad_aspect_ids) == 0:
+    if doc_event_aspect_ids is None or len(doc_event_aspect_ids) == 0 \
+            or doc_ad_aspect_ids is None or len(doc_ad_aspect_ids) == 0:
         return None, None
         
     doc_event_aspects = dict(zip(doc_event_aspect_ids, doc_event_aspects_confidence))
@@ -1210,7 +1198,6 @@ def cosine_similarity_doc_event_doc_ad_aspects(doc_event_aspect_ids, doc_event_a
         random_error = math.pow(len(doc_event_aspect_ids) / float(len(aspect_docs_counts)), 
             intersections) * math.pow(len(doc_ad_aspect_ids) / float(len(aspect_docs_counts)), 
             intersections)
-        confidence = 1.0 - random_error
     else:
         #P(A not intersect B) = 1 - P(A intersect B)
         random_error = 1 - ((len(doc_event_aspect_ids) / float(len(aspect_docs_counts))) * 

+ 0 - 2
TensorFlow/Recommendation/WideAndDeep/trainer/task.py

@@ -20,8 +20,6 @@ import numpy as np
 import argparse
 import json
 import os
-import sys
-import pickle
 import tensorflow as tf
 import tensorflow_transform as tft
 

+ 0 - 6
TensorFlow/Recommendation/WideAndDeep/utils/hooks/training_hooks.py

@@ -15,12 +15,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import time
-import tensorflow as tf
-
-import dllogger
-
-
 class MeanAccumulator:
     def __init__(self):
         self.sum = 0