Group3d Checkpoint.ipynb

of 38
All materials on our website are shared by users. If you have any questions about copyright issues, please report them to us so we can resolve them. We are always happy to assist you.
# Notebook preamble: imports, raw-data loading, and global constants.
# Reconstructed from a garbled scrape — the original `with'...' , 'rb')` calls
# were missing `gzip.open(`; restored here since the files are .pkl.gz and
# gzip is imported.
import copy
import datetime
import gzip
import pickle
import time
from itertools import product

import pandas as pd
import numpy as np
from scipy import interpolate  # For other interpolation functions.

import sklearn.metrics
import sklearn.utils
import sklearn.linear_model
# NOTE(review): sklearn.cross_validation is deprecated (removed in 0.20, per the
# DeprecationWarning captured in the notebook output). Later cells may still use
# LabelKFold, so the import is kept; migrate to sklearn.model_selection.GroupKFold.
from sklearn.cross_validation import LabelKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# %matplotlib inline  (IPython magic in the original notebook)
import matplotlib.pyplot as plt

# SECURITY: pickle.load executes arbitrary code — only load files you trust.
with gzip.open('merged7.pkl.gz', 'rb') as fd:
    data = pickle.load(fd)

with gzip.open('cvleak7-10fold.pkl.gz', 'rb') as fd:
    cvleak = pickle.load(fd)
data = pd.merge(data, cvleak, on='activity_id', how='left')

with gzip.open('dproc7.pkl.gz', 'rb') as fd:
    extra = pickle.load(fd)
data = pd.merge(data, extra, on='activity_id', how='left')

if True:  # Wasteful to recompute a constant every time
    mindate = pd.Timestamp('2022-07-17 00:00:00')
    maxdate = pd.Timestamp('2023-08-31 00:00:00')
    minpdate = pd.Timestamp('2020-05-18 00:00:00')
else:
    mindate = min(data['date'])
    maxdate = max(data['date'])
    minpdate = min(data['pdate'])

data.outcome.mean()  # Out[3]: 0.4439543965728709

# delete 17304, it's big and all zeros and may mess up models!
data = data[data.group_1 != 17304]
data.outcome.mean()  # Out[5]: 0.6976975552259174 — yes, removing that one group skews the outcome THAT much.
# Deduplicate activities (identical rows apart from activity_id), split
# train/test on presence of the outcome label, and build per-group feature
# frames. Reconstructed from a garbled scrape; notebook Out[] values kept as
# comments. Fixed: the one-hot column format string had extraction-artifact
# spaces ('  {0}  _group_onehotavg_  {1} ') — restored to the intended
# '{0}_group_onehotavg_{1}'; dead locals (wtot, group_class) removed.

cols = data.columns.copy()
cols = cols.drop('activity_id')
data_dups = data.duplicated(subset=cols)
data_dedup = data[~data_dups]
data_dup = data[data_dups]

len(data_dedup), len(data)                            # Out[8]: (1023194, 1731249)
data_dedup.outcome.mean(), data_dup.outcome.mean()    # Out[9]: (0.7204231620668224, 0.6655633998573868)


def split_traintest():
    """Split the module-level `data` frame on outcome presence.

    Returns (trainset, testset, train, test): the index tuples from np.where
    and the corresponding row subsets. Rows with a null outcome are test.
    """
    testset = np.where(data['outcome'].isnull())
    trainset = np.where(~data['outcome'].isnull())
    return trainset, testset, data.iloc[trainset], data.iloc[testset]


trainset, testset, train, test = split_traintest()


# skip this on a rerun (or group3+)
def build_group_df(df, classes=True):
    """Aggregate per-group statistics over `df` grouped by group_1.

    Produces one row per group with people/event counts and first/mean/latest
    day numbers for pdate/adate. `otype` is always -1 here (the outcome-based
    labelling branch is disabled with `if False`); the `classes` parameter is
    currently unused — kept for interface compatibility.
    """
    procs = {
        'group_1': [],
        'num_people': [], 'num_events': [],
        'pdate_mean': [], 'pdate_first': [], 'pdate_latest': [],
        'adate_first': [], 'adate_latest': [],
        'num_adates': [],
        'otype': [],
    }
    for g in df.groupby('group_1'):
        procs['group_1'].append(g[0])
        procs['pdate_mean'].append(g[1].pdate_daynum.mean())
        procs['pdate_first'].append(g[1].pdate_daynum.min())
        procs['pdate_latest'].append(g[1].pdate_daynum.max())
        procs['adate_first'].append(g[1].adate_daynum.min())
        procs['adate_latest'].append(g[1].adate_daynum.max())
        procs['num_people'].append(len(g[1].people_id.unique()))
        procs['num_events'].append(len(g[1]))
        procs['num_adates'].append(len(g[1].adate_daynum.unique()))
        if False:  # disabled: leaks the outcome into a feature
            m = g[1].outcome.mean()
            if m == 0:
                procs['otype'].append(0)
            elif m == 1:
                procs['otype'].append(1)
            else:
                procs['otype'].append(2)  # mixed
        else:
            procs['otype'].append(-1)
    return pd.DataFrame(procs)


df_groups = build_group_df(data_dedup)


def build_weight_oh(df, key):
    """Per-group weighted one-hot average of the integer column `key`.

    For each group_1, computes the fraction of the group's rows taking each
    value of `key` (a length max+1 vector). Columns whose overall mean is
    below 0.001 are dropped. Assumes `key` holds small non-negative integers
    usable as indices — TODO confirm against upstream encoding.
    """
    weighted = []
    numv = df[key].max() + 1
    for g in df.groupby('group_1'):
        oh = np.zeros(numv, dtype=np.float64)
        vc = g[1][key].value_counts()
        for z in zip(vc.index.values, vc.values):
            oh[z[0]] += z[1]
        weighted.append(np.hstack([[g[0]], oh / len(g[1])]))
    df_wo = pd.DataFrame(weighted)

    wo_cols = ['group_1']
    for i in range(numv):
        wo_cols.append('{0}_group_onehotavg_{1}'.format(key, i))
        # print(wo_cols[-1])
    df_wo.columns = wo_cols

    # data = pd.merge(data, df_w2o, on='group_1', how='left')
    droplist = []
    for c in df_wo.keys():
        # print(c, df_w2o[c].mean())
        if df_wo[c].mean() < .001:
            droplist.append(c)
    for d in droplist:
        df_wo.drop(d, axis=1, inplace=True)
    return df_wo


# Select the categorical columns to one-hot average: everything containing
# 'char' (except achar_10 / pchar_38) plus activity_category.
oh_keys = []
alsouse = ['activity_category']
for k in data.keys():
    if 'achar_10' in k:
        continue
    if 'pchar_38' in k:
        continue
    if k in alsouse or 'char' in k:
        oh_keys.append(k)

for k in oh_keys:
    print(k)
    df_oh = build_weight_oh(data_dedup, k)
    df_groups = pd.merge(df_groups, df_oh, on='group_1', how='left')
# printed: activity_category, achar_1..achar_9, pchar_10..pchar_19, ...
We Need Your Support
Thank you for visiting our website and for your interest in our free products and services. We are a nonprofit website for sharing and downloading documents. To keep this website running, we need your help to support us.

Thanks to everyone for your continued support.

No, Thanks

We need your signature to support the project to invent "SMART AND CONTROLLABLE REFLECTIVE BALLOONS" to cover the Sun and save our Earth.

More details...

Sign Now!

We greatly appreciate your prompt action!