2017-03-27 1 views
-2

Kaggle에서 스크립트를 포크하려고하지만 그 오류 기능을 반복 할 수 없습니다. 날짜를 문자열로 변환하려고했지만 작동하지 않습니다.기능 반복 가능하지 않음 : Kaggle Fork

https://www.kaggle.com/bitsofbits/predict-west-nile-virus/simple-lasagne-nn

오류 메시지 : 또한

net, mean, std = train() 
    File "../src/script.py", line 270, in train 
    X = assemble_X(training, weather) 
    File "../src/script.py", line 225, in assemble_X 
    for b in base: 
TypeError: 'function' object is not iterable 

, 난 변환 사전에 dataframe하지만 그다지 도움이

결과를 볼 수있는 포크 스크립트에 아래 코드를 붙여 넣으세요 :

# -*- coding: utf-8 -*- 
""" 
Created on Mon Mar 27 00:55:09 2017 

""" 
from __future__ import print_function 
from collections import defaultdict 
import numpy as np 
import datetime 
import csv 
from operator import itemgetter 
import sys 
import pandas as pd 
from sklearn import ensemble, preprocessing 
import xgboost as xgb 
from sklearn.tree import DecisionTreeClassifier 
import math 
from lasagne.layers import InputLayer, DropoutLayer, DenseLayer 
from lasagne.updates import nesterov_momentum 
from lasagne.objectives import binary_crossentropy 
from nolearn.lasagne import NeuralNet 
import theano 
from theano import tensor as T 
from theano.tensor.nnet import sigmoid 
from sklearn import metrics 
from sklearn.utils import shuffle 
from sklearn.cross_validation import train_test_split 
import datetime 

# Load dataset 
train = pd.read_csv("../input/train.csv") 
test = pd.read_csv('../input/test.csv') 

def date(text): 
    return datetime.datetime.strptime(text, "%Y-%m-%d").date() 

#Converting to Date Object 
def getdate(datacol): 
    return pd.to_datetime(datacol, format="%Y-%m-%d") 

#Getting the Year 
def getyear(datacol): 
    return datacol['Date1'].dt.year 

#Getting day of Year 
def getdayofyear(datacol): 
    return datacol['Date1'].dt.dayofyear 

#Getting the Duplicated Values 
def Duplicatedfeature(datacol): 
    datacol['Freqcount'] = datacol.groupby(['Trap','year','day_of_year','Latitude','Longitude'])['Species'].transform(pd.Series.value_counts) 
    return datacol['Freqcount'] 

#Getting Freq by Traps 
def Duplicatedtraps(datacol): 
    datacol['Freqcounttraps'] = datacol.groupby(['Trap','year'])['day_of_year'].transform(pd.Series.value_counts) 
    return datacol['Freqcounttraps'] 

#Get Frequency of the Count of traps 
def trapsfrequency(datacol): 
    datacol['trapsfrequency'] = datacol.groupby(['Trap','year','day_of_year'])['Freqcounttraps'].transform(pd.Series.value_counts) 
    return datacol['trapsfrequency'] 

#Create New feature for count of traps 
def trapsfrequencyequal(datacol,number): 
    datacol['trapsfrequencyequal_'+str(number)] = np.where(datacol['Freqcounttraps']==number,datacol['trapsfrequency'], 0) 
    return datacol['trapsfrequencyequal_'+str(number)] 

#Create New feature for count of rows of traps atleast 2 
def trapsfrequencygreater(datacol,number): 
    datacol['trapsfrequencygreater_'+str(number)] = np.where(datacol['Freqcounttraps']>number,datacol['trapsfrequency'], 0) 
    return datacol['trapsfrequencygreater_'+str(number)] 

##Get count by currendate and check for atleast 2 
def trapsfrequencybycurrentdate(datacol): 
    datacol['trapsfrequencycurrentdate'] = datacol.groupby(['Date'])['Trap'].transform(pd.Series.value_counts) 
    return datacol['trapsfrequencycurrentdate'] 

def trapsfrequencycurrentdategreater1(datacol): 
    datacol['trapsfrequencycurrentdategreater1'] = np.where(datacol['trapsfrequencycurrentdate']>1,datacol['trapsfrequencycurrentdate'], 0) 
    return datacol['trapsfrequencycurrentdategreater1'] 

def Preprocessing(train): 
    train['Date1']=getdate(train['Date']) 
    train['year']=getyear(train).astype('int64') 
    train['day_of_year']=getdayofyear(train).astype('int64') 
    train['Freqcount']=Duplicatedfeature(train).astype('int64') 
    train['Freqcounttraps']=Duplicatedtraps(train).astype('int64') 
    train['trapsfrequency']=trapsfrequency(train).astype('int64') 
    trapsfrequencyequal(train,2).astype('int64') 
    trapsfrequencyequal(train,3).astype('int64') 
    trapsfrequencyequal(train,4).astype('int64') 
    trapsfrequencyequal(train,5).astype('int64') 
    trapsfrequencyequal(train,6).astype('int64') 
    train['sumoftrapsfrequencyequal']=train.trapsfrequencyequal_2+train.trapsfrequencyequal_3+train.trapsfrequencyequal_4+train.trapsfrequencyequal_5+train.trapsfrequencyequal_6 
    trapsfrequencygreater(train,1).astype('int64') 
    trapsfrequencybycurrentdate(train).astype('int64') 
    trapsfrequencycurrentdategreater1(train).astype('int64') 
    train.drop(['Date1','Address','AddressNumberAndStreet'], inplace=True, axis=1) 
    #train['AddressNumberandStreet'].apply(int) 
    train['trapsfrequencycurrentdate']=train['trapsfrequencycurrentdate'].astype('int64') 
    train['trapsfrequencycurrentdategreater1']=train['trapsfrequencycurrentdategreater1'].astype('int64') 
    return train 


train=Preprocessing(train) 
test=Preprocessing(test) 

# Convert categorical data to numbers 
lbl = preprocessing.LabelEncoder() 
lbl.fit(list(train['Species'].values) + list(test['Species'].values)) 
train['Species'] = lbl.transform(train['Species'].values) 
test['Species'] = lbl.transform(test['Species'].values) 

lbl.fit(list(train['Street'].values) + list(test['Street'].values)) 
train['Street'] = lbl.transform(train['Street'].values) 
test['Street'] = lbl.transform(test['Street'].values) 

lbl.fit(list(train['Trap'].values) + list(test['Trap'].values)) 
train['Trap'] = lbl.transform(train['Trap'].values) 
test['Trap'] = lbl.transform(test['Trap'].values) 

train.info() 

train=train.to_dict() 
test=test.to_dict() 


def precip(text): 
    TRACE = 1e-3 
    text = text.strip() 
    if text == "M": 
     return None 
    if text == "-": 
     return None 
    if text == "T": 
     return TRACE 
    return float(text) 

def impute_missing_weather_station_values(weather): 
    # Stupid simple 
    for k, v in weather.items(): 
     if v[0] is None: 
      v[0] = v[1] 
     elif v[1] is None: 
      v[1] = v[0] 
     for k1 in v[0]: 
      if v[0][k1] is None: 
       v[0][k1] = v[1][k1] 
     for k1 in v[1]: 
      if v[1][k1] is None: 
       v[1][k1] = v[0][k1] 

def load_weather(): 
    weather = {} 
    for line in csv.DictReader(open("../input/weather.csv")): 
     for name, converter in {"Date" : date, 
           "Tmax" : float,"Tmin" : float,"Tavg" : float, 
           "DewPoint" : float, "WetBulb" : float, 
           "PrecipTotal" : precip, 
           "Depart" : float, 
           "ResultSpeed" : float,"ResultDir" : float,"AvgSpeed" : float, 
           "StnPressure" : float, "SeaLevel" : float}.items(): 
      x = line[name].strip() 
      line[name] = converter(x) if (x != "M") else None 
     station = int(line["Station"]) - 1 
     assert station in [0,1] 
     dt = line["Date"] 
     if dt not in weather: 
      weather[dt] = [None, None] 
     assert weather[dt][station] is None, "duplicate weather reading {0}:{1}".format(dt, station) 
     weather[dt][station] = line 
    impute_missing_weather_station_values(weather)   
    return weather 


def load_training(): 
    training = train 
    #for index,r in train: 
    # training.append((r['Date'],r['Latitude'],r['Species'],r['Trap'],r['Latitude'],r['Longitude'],r['NumMosquitos'],r['year'],r['WnvPresent'],r['day_of_year'],r['Freqcount'],r['Freqcount'],r['Freqcounttraps'],r['trapsfrequency'],r['trapsfrequencyequal_2'],r['trapsfrequencyequal_3'],r['trapsfrequencyequal_4'],r['trapsfrequencyequal_5'],r['trapsfrequencyequal_6'],r['sumoftrapsfrequencyequal'],r['trapsfrequencygreater_1'],r['trapsfrequencycurrentdategreater1'])) 
    return training 

def load_testing(): 
    training = test 
    #for line in csv.DictReader(open("../input/test.csv")): 
    # for name, converter in {"Date" : datetime.date, 
    #       "Latitude" : float, "Longitude" : float}.items(): 
    #  line[name] = converter(line[name]) 
    # training.append(line) 
    return training 


def closest_station(lat, long): 
    # Chicago is small enough that we can treat coordinates as rectangular. 
    stations = np.array([[41.995, -87.933], 
         [41.786, -87.752]]) 
    loc = np.array([lat, long]) 
    deltas = stations - loc[None, :] 
    dist2 = (deltas**2).sum(1) 
    return np.argmin(dist2) 

def normalize(X, mean=None, std=None): 
    count = X.shape[1] 
    if mean is None: 
     mean = np.nanmean(X, axis=0) 
    for i in range(count): 
     X[np.isnan(X[:,i]), i] = mean[i] 
    if std is None: 
     std = np.std(X, axis=0) 
    for i in range(count): 
     X[:,i] = (X[:,i] - mean[i])/std[i] 
    return mean, std 

def scaled_count(record): 
    SCALE = 10.0 
    if "NumMosquitos" not in record: 
     # This is test data 
     return 1 
    return int(np.ceil(record["NumMosquitos"]/SCALE)) 


def assemble_X(base, weather): 
    X = [] 
    for b in base: 
     date = b["Date"] 
     lat, long = b["Latitude"], b["Longitude"] 
     case = [date.year, date.month, date.day, lat, long] 
     # Look at a selection of past weather values 
     for days_ago in [1,3,7,14]: 
      day = date - datetime.timedelta(days=days_ago) 
      for obs in ["Tmax","Tmin","Tavg","DewPoint","WetBulb","PrecipTotal","Depart"]: 
       station = closest_station(lat, long) 
       case.append(weather[day][station][obs]) 
     # Specify which mosquitos are present 
     species_vector = [float(x) for x in species_map[b["Species"]]] 
     case.extend(species_vector) 
     # Weight each observation by the number of mosquitos seen. Test data 
     # Doesn't have this column, so in that case use 1. This accidentally 
     # Takes into account multiple entries that result from >50 mosquitos 
     # on one day. 
     for repeat in range(scaled_count(b)): 
      X.append(case)  
    X = np.asarray(X, dtype=np.float32) 
    return X 

def assemble_y(base): 
    y = [] 
    for b in base: 
     present = b["WnvPresent"] 
     for repeat in range(scaled_count(b)): 
      y.append(present)  
    return np.asarray(y, dtype=np.int32).reshape(-1,1) 


class AdjustVariable(object): 
    def __init__(self, variable, target, half_life=20): 
     self.variable = variable 
     self.target = target 
     self.half_life = half_life 
    def __call__(self, nn, train_history): 
     delta = self.variable.get_value() - self.target 
     delta /= 2**(1.0/self.half_life) 
     self.variable.set_value(np.float32(self.target + delta)) 

def train(): 
    weather = load_weather() 
    training = load_training() 

    X = assemble_X(training, weather) 
    mean, std = normalize(X) 
    y = assemble_y(training) 

    input_size = len(X[0]) 

    learning_rate = theano.shared(np.float32(0.1)) 

    net = NeuralNet(
    layers=[ 
     ('input', InputLayer), 
     ('hidden1', DenseLayer), 
     ('dropout1', DropoutLayer), 
     ('hidden2', DenseLayer), 
     ('dropout2', DropoutLayer), 
     ('output', DenseLayer), 
     ], 
    # layer parameters: 
    input_shape=(None, input_size), 
    hidden1_num_units=256, 
    dropout1_p=0.4, 
    hidden2_num_units=256, 
    dropout2_p=0.4, 
    output_nonlinearity=sigmoid, 
    output_num_units=1, 

    # optimization method: 
    update=nesterov_momentum, 
    update_learning_rate=learning_rate, 
    update_momentum=0.9, 

    # Decay the learning rate 
    on_epoch_finished=[ 
      AdjustVariable(learning_rate, target=0, half_life=4), 
      ], 

    # This is silly, but we don't want a stratified K-Fold here 
    # To compensate we need to pass in the y_tensor_type and the loss. 
    regression=True, 
    y_tensor_type = T.imatrix, 
    objective_loss_function = binary_crossentropy, 

    max_epochs=32, 
    eval_size=0.1, 
    verbose=1, 
    ) 

    X, y = shuffle(X, y, random_state=123) 
    net.fit(X, y) 

    _, X_valid, _, y_valid = train_test_split(X, y) 
    probas = net.predict_proba(X_valid)[:,0] 
    print("ROC score", metrics.roc_auc_score(y_valid, probas)) 

    return net, mean, std  


def submit(net, mean, std): 
    weather = load_weather() 
    testing = load_testing() 
    X = assemble_X(testing, weather) 
    normalize(X, mean, std) 
    predictions = net.predict_proba(X)[:,0]  
    # 
    out = csv.writer(open("west_nile.csv", "w")) 
    out.writerow(["Id","WnvPresent"]) 
    for row, p in zip(testing, predictions): 
     out.writerow([row["Id"], p]) 


if __name__ == "__main__": 
    net, mean, std = train() 
    submit(net, mean, std) 
+0

스택 오버플로에 오신 것을 환영합니다. 좋은 질문을하는 방법을 알아 보려면 도움말 센터를 방문하십시오. 그것은 많은 코드를 버리고 오류 메시지를 게시하지 않고 다시 추적하며 "작동하지 않는다"는 것을 의미하지 않습니다. 그 중 어떤 것도 잘 묻는 질문을위한 재료가 아닙니다. –

+0

Thanks Steven ... 오류 메시지를 추가했습니다. –

답변

0

아마도 N assemble_X의 aN 값은 다음과 같이 처리하려고 시도합니다.

for b in base: 
    if isinstance(b): 
관련 문제