Welcome to our tutorial! In this project, the goal is to look at which factors matter most for NBA draft picks. First, a little bit about the NBA. The National Basketball Association, or NBA, is the highest-level professional basketball league in the world. It is composed of 30 teams that each play 82 games before a playoff bracket is set. The most common way for players to enter the league (after 2006; read more about the rule change here) is to be drafted in the NBA draft. Most drafted players come from collegiate basketball, more specifically NCAA Division I men's basketball. The draft consists of 2 rounds, with each team getting 1 pick per round, for a total of 60 picks.
The end goal of this project is to try to predict the 2021 NBA draft, which will take place on July 29th. Being able to predict where players will be drafted would be useful for those in the sports industry: a team could anticipate which players other teams will select, see whether players are over- or under-rated, or even help set sports betting odds. However, to start predicting the draft, we must look at past drafts to see what the most important factors for an NBA draft pick are. The project will be broken up into 5 sections:
Over the course of this guide, we hope the reader is able to understand how and why data analysis is done, and would be able to follow similar steps to do some data science on other topics!
The following imports and libraries will be used. If you want to read the documentation, click below.
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
# These are for the machine learning
import matplotlib as matlib
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
!pip3 install lxml
Requirement already satisfied: lxml in /home/ibutz/.local/lib/python3.8/site-packages (4.6.3)
Perhaps the most important part is getting the data. The first dataset needed is a collection of past NBA draft picks' college stats. This is what we will do our analysis on and train our machine learning model with. We found the website barttorvik.com, which contains data on NCAA men's basketball players. Secondly, a dataset is needed for the current NBA draft class. With these two datasets we will be able to inspect our data and do preliminary hypothesis testing, create a model for the data, and lastly try to predict this year's draft order.
Unfortunately, the data is generated using JavaScript and we were unable to parse it directly, so it was collected manually. Here are the steps. From here, under the top drop-down menu, 'all' was selected to get all-time data. Next, scrolling down the left side, Min%, O-Reb%, D-Reb%, Ast%, Blk%, Stl%, FTR, 2PM, 3PM, and drafted were selected to be shown. Drafted was also set to "<=" and "60", since there are 60 picks in the NBA draft. After that, "name" was clicked twice so the players were sorted by name. Then, the "Show 100 more" option was repeatedly selected until the full 1302-row table was shown. Lastly, the entire table was selected with the cursor and copy/pasted into a blank csv file that was read into pandas.
A better approach would be to use a library such as selenium; however, we had difficulties getting it to work.
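For completeness, here is a rough sketch of what that automated approach might look like. It is illustrative only and was not used in this project: it assumes selenium and a matching chromedriver are installed, and the URL and wait time are placeholders that would need to be pointed at the actual player-stats page and its filters.
import time
import pandas as pd
from selenium import webdriver

# Sketch only -- not used for this project. The URL below is a placeholder for the
# barttorvik player-stats page; the page's filters (years, columns, drafted <= 60)
# would still need to be applied by clicking the controls or via URL parameters.
driver = webdriver.Chrome()
driver.get("https://barttorvik.com/")        # placeholder: navigate to the player-stats page
time.sleep(5)                                # crude wait for the JavaScript table to render
tables = pd.read_html(driver.page_source)    # parse every <table> in the rendered HTML
driver.quit()
print(len(tables))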
# Start by reading in from the csv we manually created
pastPicks_df = pd.read_csv('past_college_players.csv', sep='\t')
pastPicks_df.head()
RK | PICK | PLAYER | TEAM | CONF | MIN% | OR | DR | AST | BLK | STL | FTR | 2P | 3P | YEAR | ||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 46 | Fr | 7-0 | A.J. Hammons (13) | Purdue | B10 | 57.4 | 11.6 | 17.3 | 6.2 | 8.7 | 0.9 | 41.0 | 140-283 | 0.495 | 0-0 | 0.000 | 2013 |
2 | 46 | So | 7-0 | A.J. Hammons (14) | Purdue | B10 | 59.6 | 10.1 | 22.8 | 4.2 | 13.2 | 0.9 | 62.2 | 114-220 | 0.518 | 0-5 | 0.000 | 2014 |
3 | 46 | Jr | 7-0 | A.J. Hammons (15) | Purdue | B10 | 60.1 | 12.2 | 19.7 | 9.1 | 12.3 | 1.3 | 46.7 | 157-284 | 0.553 | 0-7 | 0.000 | 2015 |
4 | 46 | Sr | 7-0 | A.J. Hammons (16) | Purdue | B10 | 57.5 | 11.5 | 24.8 | 10.5 | 10.4 | 0.7 | 37.8 | 193-325 | 0.594 | 6-11 | 0.545 | 2016 |
5 | 52 | Jr | 6-2 | A.J. Price (08) | Connecticut | BE | 79.7 | 2.8 | 8.1 | 36.1 | 0.3 | 2.3 | 38.8 | 108-225 | 0.480 | 52-141 | 0.369 | 2008 |
The data is super messy. We want to fix it so that we have the following columns: pick, class, player, minutes_per, off_reb_per, def_reb_per, assist_per, block_per, steal_per, free_throw_per, two_point_per, and three_point_per.
The last 8 correspond to entries in a glossary by basketball-reference.com that further explains what these columns mean.
# Create a 2d array that has 12 arrays for our 12 categories
data = [[], [], [], [], [], [], [], [], [], [], [], []]
# Note: because the pasted table's header row has fewer fields than its data rows,
# pandas shifted everything on read -- the first few fields became the index, and each
# named column actually holds the value of a column a few places to its right.
# The lookups below account for that shift.
for i, r in pastPicks_df.iterrows():
    data[0].append(i[1])                                 # pick number
    data[1].append(i[2])                                 # class (Fr/So/Jr/Sr)
    data[2].append(r['RK'][slice(0, len(r['RK']) - 5)])  # player name, trailing ' (YY)' removed
    data[3].append(r['TEAM'])                            # minutes_per
    data[4].append(r['CONF'])                            # off_reb_per
    data[5].append(r['MIN%'])                            # def_reb_per
    data[6].append(r['OR'])                              # assist_per
    data[7].append(r['DR'])                              # block_per
    data[8].append(r['AST'])                             # steal_per
    data[9].append(r['BLK'])                             # free_throw_per
    data[10].append(r['FTR'])                            # two_point_per
    data[11].append(r['3P'])                             # three_point_per
headers = ['pick', 'class', 'player', 'minutes_per', 'off_reb_per',
'def_reb_per', 'assist_per', 'block_per', 'steal_per', 'free_throw_per',
'two_point_per', 'three_point_per']
# pandas reads in row wise, so I use T to take the transpose
pastPicks_df = pd.DataFrame(np.array(data).T, columns=headers)
# Outputting this to a csv for data_training
pastPicks_df.to_csv('allyears.csv')
# Next, only take a player's final year in the NCAA.
# Thankfully, the way the data is formatted, all I have to do is keep the last occurrence of each player
pastPicks_df = pastPicks_df.drop_duplicates(subset='player', keep='last', ignore_index=True)
# Lastly, I want to make sure we have our columns as numeric data
for c in pastPicks_df.columns:
    if c != 'class' and c != 'player':
        pastPicks_df[c] = pd.to_numeric(pastPicks_df[c])
# Push it to csv for later use as well
pastPicks_df.to_csv('cleaned_train.csv')
pastPicks_df.head()
pick | class | player | minutes_per | off_reb_per | def_reb_per | assist_per | block_per | steal_per | free_throw_per | two_point_per | three_point_per | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 46 | Sr | A.J. Hammons | 57.5 | 11.5 | 24.8 | 10.5 | 10.4 | 0.7 | 37.8 | 0.594 | 0.545 |
1 | 52 | Sr | A.J. Price | 73.0 | 2.1 | 9.2 | 28.5 | 0.0 | 1.2 | 30.3 | 0.422 | 0.398 |
2 | 4 | Fr | Aaron Gordon | 77.1 | 10.4 | 19.3 | 13.0 | 3.4 | 1.8 | 47.1 | 0.513 | 0.356 |
3 | 23 | Jr | Aaron Holiday | 92.1 | 1.6 | 8.7 | 29.9 | 0.7 | 1.9 | 41.7 | 0.486 | 0.429 |
4 | 14 | Fr | Aaron Nesmith | 72.2 | 4.2 | 17.3 | 10.8 | 2.1 | 1.4 | 27.5 | 0.474 | 0.337 |
Unfortunately, we will not be able to predict everyone in the draft, because the deadline for players to declare for the NBA draft is May 30th and this project is due May 17th. However, most players who will be picked high in the draft declare early, so we can use these early declarations for predictions. This website contains a nice table of all the players who have declared early thus far.
Next, we use this list of names as a cross-reference for the 2021 season stats, getting the 2021 season player stats from barttorvik.com again. Like before, this data is generated using JavaScript, so here are the steps for how we manually got it. First, go to barttorvik player stats. Make sure the year is 2021. Next, select Min%, O-Reb%, D-Reb%, Ast%, Blk%, Stl%, FTR, 2PM, and 3PM. Once again, "Show 100 more" was selected repeatedly until all 2125 players were visible. Lastly, this was copy/pasted into a csv file, which is read in and processed.
# HTML Scraping the list of early draft commits using requests, BeautifulSoup, and pandas
r = requests.get("https://ca.nba.com/news/2021-nba-draft-ncaa-\
players-that-have-declared-for-the-2021-nba-draft-e/hphzsl6bo6dh16t9tpee5628y")
soup = BeautifulSoup(r.content, 'lxml')
# Find the table in the html, thankfully only one
html_table = soup.find("table")
earlycomm_df = pd.read_html(str(html_table), header=0)[0]
earlycomm_df.head()
Player | Position | Year | School | |
---|---|---|---|---|
0 | Ochai Agbaji | SG | Jr. | Kansas |
1 | James Akinjo | PG | Jr. | Arizona |
2 | Keve Aluma | PF | R-Jr. | Virginia Tech |
3 | Jose Alvarado | PG | Sr. | Georgia Tech |
4 | Avery Anderson III | PG | So. | Oklahoma State |
# Moving onto the 2021 season data, we read in our player data from the csv
stats2021_df = pd.read_csv('2021season.csv', sep='\t')
stats2021_df.head()
RK | PLAYER | TEAM | CONF | MIN% | OR | DR | AST | BLK | STL | FTR | 2P | 3P | ||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Jr | 6-5 | A.J. Caldwell | Chattanooga | SC | 80.2 | 1.6 | 14.9 | 13.7 | 1.3 | 2.1 | 9.2 | 15-41 | 0.366 | 36-90 | 0.400 |
2 | Sr | 6-5 | A.J. Lawson | McNeese St. | Slnd | 73.3 | 5.3 | 14.9 | 33.9 | 0.4 | 1.4 | 42.3 | 58-122 | 0.475 | 11-34 | 0.324 |
3 | Jr | 6-5 | A.J. Oliver II | Old Dominion | CUSA | 66.9 | 2.5 | 8.9 | 7.3 | 0.8 | 1.1 | 19.2 | 36-65 | 0.554 | 32-102 | 0.314 |
4 | So | 6-2 | A.J. Plitzuweit | South Dakota | Sum | 75.6 | 2.4 | 11.6 | 23.4 | 0.3 | 1.5 | 38.3 | 76-165 | 0.461 | 52-117 | 0.444 |
5 | Jr | 6-6 | A.J. Reeves | Providence | BE | 77.2 | 2.3 | 10.6 | 10.3 | 0.3 | 1.9 | 23.2 | 31-74 | 0.419 | 48-150 | 0.320 |
To clean up this DataFrame, we follow almost exactly the same steps as in step a. The columns will be exactly the same, except there won't be a 'pick' column.
# Create a 2d array that has 11 arrays for our 11 categories
data = [[], [], [], [], [], [], [], [], [], [], []]
# The same header/data shift as before applies here, so the column labels in the
# lookups below again do not match the values they hold
for i, r in stats2021_df.iterrows():
    data[0].append(i[1])          # class (Fr/So/Jr/Sr)
    data[1].append(i[3])          # player name
    data[2].append(r['TEAM'])     # minutes_per
    data[3].append(r['CONF'])     # off_reb_per
    data[4].append(r['MIN%'])     # def_reb_per
    data[5].append(r['OR'])       # assist_per
    data[6].append(r['DR'])       # block_per
    data[7].append(r['AST'])      # steal_per
    data[8].append(r['BLK'])      # free_throw_per
    data[9].append(r['FTR'])      # two_point_per
    data[10].append(r['3P'])      # three_point_per
# pandas reads in row wise, so I use T to take the transpose
temp = pd.DataFrame(np.array(data).T, columns=headers[1:])
# Cross reference to get players who have declared for the draft
temp = temp.where(temp['player'].isin(earlycomm_df['Player']))
stats2021_df = temp.dropna(how='all').reset_index(drop=True)
# Once again, I want to make sure we have numerical data
for c in stats2021_df.columns:
    if c != 'class' and c != 'player':
        stats2021_df[c] = pd.to_numeric(stats2021_df[c])
# push to csv for later
stats2021_df.to_csv('cleaned2021.csv')
stats2021_df.head()
class | player | minutes_per | off_reb_per | def_reb_per | assist_per | block_per | steal_per | free_throw_per | two_point_per | three_point_per | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Jr | Aaron Henry | 80.9 | 4.8 | 14.6 | 25.2 | 4.1 | 2.4 | 28.9 | 0.493 | 0.296 |
1 | Jr | Aaron Wiggins | 83.3 | 4.5 | 15.9 | 17.2 | 2.0 | 2.1 | 20.3 | 0.515 | 0.346 |
2 | Jr | Alan Griffin | 72.8 | 6.3 | 15.2 | 12.2 | 6.6 | 2.3 | 19.0 | 0.514 | 0.354 |
3 | Sr | Alfonso Plummer | 69.8 | 1.3 | 7.9 | 7.5 | 0.2 | 1.5 | 12.1 | 0.526 | 0.383 |
4 | So | Armando Bacot | 56.6 | 14.5 | 22.3 | 7.3 | 4.3 | 1.6 | 58.3 | 0.631 | 0.000 |
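As an aside, the where(...).dropna(how='all') pattern used above to keep only the declared players is equivalent to a plain boolean-mask filter, which some readers may find easier to follow. A minimal sketch:
# Equivalent filter for declared players using a boolean mask (sketch only)
declared = temp[temp['player'].isin(earlycomm_df['Player'])].reset_index(drop=True)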
Now that all the data has been collected, the data analysis can begin. The goal is to look for trends between high draft picks and players' stats; any trend we can observe would be something to train our model on. For plotting we will use matplotlib to display the data.
We will start off by plotting possibly the simplest breakdown: age (class). A common sentiment among basketball fans, and our hypothesis, is that younger players are drafted earlier, because NBA teams want to develop players on their own and younger players adapt more easily. At the same time, we can plot draft pick over minutes played to see how time spent in games correlates with draft position. Finally, we will look at players' stats to see if there are any correlations between those stats and draft placement.
# Set up for this plot
classes = ['Fr', 'So', 'Jr', 'Sr']
colors = ['red', 'yellow', 'blue', 'green']
by_age = []
# Use a list comprehension 4 times
for c in classes:
    # Get 4 lists, each one contains draft picks from that class
    by_age.append([r['pick'] for i, r in pastPicks_df.iterrows() if r['class'] == c])
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))
# Plotting by class
parts1 = ax1.violinplot(by_age, range(len(classes)), showmeans=True)
# Plotting Pick over minutes played.
parts2 = ax2.scatter(pastPicks_df['minutes_per'], pastPicks_df['pick'])
# Setting plot colors
for i, pc in enumerate(parts1['bodies']):
    pc.set_facecolor(colors[i])
# Titles/labels
plt.suptitle("Draft Pick Distributions", fontsize=25)
ax1.set_title("Distributed by Class", fontsize=20)
ax1.set_xlabel('Class', fontsize=15)
ax1.set_ylabel('Pick Number', fontsize=15)
ax2.set_title("Distributed by Minutes", fontsize=20)
ax2.set_xlabel('Percent of game played', fontsize=15)
ax2.set_ylabel('Pick Number', fontsize=15)
plt.sca(ax1)
plt.xticks(range(len(classes)), classes, fontsize='x-large')
plt.show()
Looking at the plot on the left, it seems that if a freshman is going to be drafted, they will be picked early in the draft. Seniors are the opposite, often being picked at the end of the draft. The trend across classes seems to be somewhat linear, so this could be a good parameter for training our model on, and it is in line with our hypothesis (a rough way to quantify the trend is sketched below). Conversely, the distribution of pick number over minutes played shows almost no correlation. It is a total mess, and thus it will not be something that we will use in our predictive model.
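As a quick way to quantify that read of the plot (a sketch only; we did not fold this into the pipeline), one could map the class labels to an ordinal 1-4 scale and compute a rank correlation with pick number:
# Quantify the class-vs-pick trend seen in the violin plot (illustrative sketch)
class_order = {'Fr': 1, 'So': 2, 'Jr': 3, 'Sr': 4}
class_ordinal = pastPicks_df['class'].map(class_order)
# A positive Spearman correlation would mean older players tend to be picked later
print(class_ordinal.corr(pastPicks_df['pick'], method='spearman'))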
Next, we will look at how the players' stats impact their draft position. First, we will plot each of the players' stats against draft pick to see if there is any basic correlation.
fig, axs = plt.subplots(2, 4, figsize=(12,12))
titles = pastPicks_df.columns[4:]
for i, row in enumerate(axs):
    for j, plot in enumerate(row):
        title = titles[4*i + j]
        plot.scatter(pastPicks_df['pick'], pastPicks_df[title])
        plot.set_xlabel("Pick number")
        plot.set_ylabel(title)
        plot.set_title(title + " over pick number")
fig.tight_layout()
plt.show()
There seems to be no correlation between draft pick and any of these stats; every plot is a mess. This means that there is no specific player stat that really makes or breaks a player's draft position (a quick numerical check of this is sketched below). This makes sense considering the variety of positions in basketball and the different skill sets they require: some years teams may want a defensive-minded player who gets a lot of steals and blocks, while other teams may draft a good shooter really high.
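One quick way to back up this visual reading (again, a sketch rather than part of our analysis) is to compute the correlation of each stat with pick number directly:
# Correlation of each per-game stat with pick number (sketch only)
stat_cols = ['minutes_per', 'off_reb_per', 'def_reb_per', 'assist_per', 'block_per',
             'steal_per', 'free_throw_per', 'two_point_per', 'three_point_per']
print(pastPicks_df[stat_cols + ['pick']].corr()['pick'].sort_values())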
Since there seems to be no correlation there, we are going to loop back in the data science pipeline to collecting more data. Perhaps there is a stronger relationship with biometrics: height, weight, vertical, etc. We found this data on Data World. To access it you do have to make an account, but that is free and simple. The data was read in and joined to our existing pastPicks_df.
biometric_df = pd.read_csv("nba_draft_combine_all_years.csv", index_col=0)
biometric_df.head()
Player | Year | Draft pick | Height (No Shoes) | Height (With Shoes) | Wingspan | Standing reach | Vertical (Max) | Vertical (Max Reach) | Vertical (No Step) | Vertical (No Step Reach) | Weight | Body Fat | Hand (Length) | Hand (Width) | Bench | Agility | Sprint | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Blake Griffin | 2009 | 1.0 | 80.50 | 82.00 | 83.25 | 105.0 | 35.5 | 140.5 | 32.0 | 137.0 | 248.0 | 8.2 | NaN | NaN | 22.0 | 10.95 | 3.28 |
1 | Terrence Williams | 2009 | 11.0 | 77.00 | 78.25 | 81.00 | 103.5 | 37.0 | 140.5 | 30.5 | 134.0 | 213.0 | 5.1 | NaN | NaN | 9.0 | 11.15 | 3.18 |
2 | Gerald Henderson | 2009 | 12.0 | 76.00 | 77.00 | 82.25 | 102.5 | 35.0 | 137.5 | 31.5 | 134.0 | 215.0 | 4.4 | NaN | NaN | 8.0 | 11.17 | 3.14 |
3 | Tyler Hansbrough | 2009 | 13.0 | 80.25 | 81.50 | 83.50 | 106.0 | 34.0 | 140.0 | 27.5 | 133.5 | 234.0 | 8.5 | NaN | NaN | 18.0 | 11.12 | 3.27 |
4 | Earl Clark | 2009 | 14.0 | 80.50 | 82.25 | 86.50 | 109.5 | 33.0 | 142.5 | 28.5 | 138.0 | 228.0 | 5.2 | NaN | NaN | 5.0 | 11.17 | 3.35 |
pastPicks_df = pastPicks_df.join(biometric_df)
# Dropping duplicate columns
pastPicks_df.drop(['Player', 'Year', 'Draft pick'], axis=1, inplace=True)
pastPicks_df.head()
pick | class | player | minutes_per | off_reb_per | def_reb_per | assist_per | block_per | steal_per | free_throw_per | ... | Vertical (Max Reach) | Vertical (No Step) | Vertical (No Step Reach) | Weight | Body Fat | Hand (Length) | Hand (Width) | Bench | Agility | Sprint | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 46 | Sr | A.J. Hammons | 57.5 | 11.5 | 24.8 | 10.5 | 10.4 | 0.7 | 37.8 | ... | 140.5 | 32.0 | 137.0 | 248.0 | 8.2 | NaN | NaN | 22.0 | 10.95 | 3.28 |
1 | 52 | Sr | A.J. Price | 73.0 | 2.1 | 9.2 | 28.5 | 0.0 | 1.2 | 30.3 | ... | 140.5 | 30.5 | 134.0 | 213.0 | 5.1 | NaN | NaN | 9.0 | 11.15 | 3.18 |
2 | 4 | Fr | Aaron Gordon | 77.1 | 10.4 | 19.3 | 13.0 | 3.4 | 1.8 | 47.1 | ... | 137.5 | 31.5 | 134.0 | 215.0 | 4.4 | NaN | NaN | 8.0 | 11.17 | 3.14 |
3 | 23 | Jr | Aaron Holiday | 92.1 | 1.6 | 8.7 | 29.9 | 0.7 | 1.9 | 41.7 | ... | 140.0 | 27.5 | 133.5 | 234.0 | 8.5 | NaN | NaN | 18.0 | 11.12 | 3.27 |
4 | 14 | Fr | Aaron Nesmith | 72.2 | 4.2 | 17.3 | 10.8 | 2.1 | 1.4 | 27.5 | ... | 142.5 | 28.5 | 138.0 | 228.0 | 5.2 | NaN | NaN | 5.0 | 11.17 | 3.35 |
5 rows × 27 columns
Here is a description of the new DataFrame. The first 12 columns are the same, and the new columns are mostly self-explanatory. As you can see in the Non-Null Count, the combine measurements we added are not complete; some columns, such as "Bench", only have 284 out of 621 entries. However, this should be enough data to do more hypothesis testing on (a sketch of how the gaps could be handled follows the summary below).
pastPicks_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 621 entries, 0 to 620
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   pick                      621 non-null    int64
 1   class                     621 non-null    object
 2   player                    621 non-null    object
 3   minutes_per               621 non-null    float64
 4   off_reb_per               621 non-null    float64
 5   def_reb_per               621 non-null    float64
 6   assist_per                621 non-null    float64
 7   block_per                 621 non-null    float64
 8   steal_per                 621 non-null    float64
 9   free_throw_per            621 non-null    float64
 10  two_point_per             621 non-null    float64
 11  three_point_per           621 non-null    float64
 12  Height (No Shoes)         517 non-null    float64
 13  Height (With Shoes)       516 non-null    float64
 14  Wingspan                  517 non-null    float64
 15  Standing reach            517 non-null    float64
 16  Vertical (Max)            450 non-null    float64
 17  Vertical (Max Reach)      450 non-null    float64
 18  Vertical (No Step)        450 non-null    float64
 19  Vertical (No Step Reach)  450 non-null    float64
 20  Weight                    516 non-null    float64
 21  Body Fat                  514 non-null    float64
 22  Hand (Length)             470 non-null    float64
 23  Hand (Width)              468 non-null    float64
 24  Bench                     284 non-null    float64
 25  Agility                   444 non-null    float64
 26  Sprint                    446 non-null    float64
dtypes: float64(24), int64(1), object(2)
memory usage: 131.1+ KB
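If we did want to train on the combine measurements, those gaps would have to be dealt with first. We did not end up doing this, but a minimal sketch of two common options looks like the following:
# Sketch only: two common ways of handling the missing combine measurements
print(pastPicks_df.isna().sum().sort_values(ascending=False).head())   # how many values are missing per column
complete_rows = pastPicks_df.dropna()                                  # option 1: keep only players with complete measurements
filled = pastPicks_df.fillna(pastPicks_df.select_dtypes(include='number').median())  # option 2: fill numeric gaps with column medians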
In this section, we will make scatter plots of each biometric against the player's pick and see which biometric affects draft position the most.
# See the correlation between each biometric and the pick of the player in a scatter plot
bio_cols, i = [], 0
# fig, axs = plt.subplots(2, 4, figsize=(12,12))
# Skip the first three columns (Player, Year, Draft pick); the rest are the biometrics.
# The list is named bio_cols so it does not shadow the sklearn `metrics` module imported above.
for col in biometric_df.columns:
    if i > 2:
        bio_cols.append(col)
    i += 1
pick = biometric_df['Draft pick']
fig, axs = plt.subplots(3, 5, figsize=(12,12))
count = 0
for i, row in enumerate(axs):
    for j, plot in enumerate(row):
        plot.scatter(pick, biometric_df[bio_cols[count]])
        # fit = np.polyfit(biometric_df[bio_cols[count]], pick, deg=1)
        # plot.plot(biometric_df[bio_cols[count]], fit[0] * np.array(biometric_df[bio_cols[count]]) + fit[1], color='r')
        plot.set_xlabel("Pick number")
        plot.set_ylabel(bio_cols[count])
        plot.set_title(bio_cols[count] + " vs. pick")
        count += 1
fig.tight_layout()
plt.show()
Next, we can try to train a model to predict the 2021 NBA draft. There aren't too many good parameters, so for the most part we will be training on all of the data points.
We will use these features as the input to our different models and the pick as the output. We will train the data using a k-nearest-neighbors model, a support vector machine, logistic regression, and a multilayer perceptron, then see how accurately each model fits the training dataset. We will pick the 2 models that perform best and use them to predict the 2021 draft.
We take NBA draft data from previous years, starting from 2008, and use the different models to test how accurately each one predicts the draft picks of past NBA players. Then we will use the most accurate model to predict the 2021 NBA draft picks.
train = pd.read_csv("allyears.csv")
train.head()
Unnamed: 0 | pick | class | player | minutes_per | off_reb_per | def_reb_per | assist_per | block_per | steal_per | free_throw_per | two_point_per | three_point_per | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 46 | Fr | A.J. Hammons | 57.4 | 11.6 | 17.3 | 6.2 | 8.7 | 0.9 | 41.0 | 0.495 | 0.000 |
1 | 1 | 46 | So | A.J. Hammons | 59.6 | 10.1 | 22.8 | 4.2 | 13.2 | 0.9 | 62.2 | 0.518 | 0.000 |
2 | 2 | 46 | Jr | A.J. Hammons | 60.1 | 12.2 | 19.7 | 9.1 | 12.3 | 1.3 | 46.7 | 0.553 | 0.000 |
3 | 3 | 46 | Sr | A.J. Hammons | 57.5 | 11.5 | 24.8 | 10.5 | 10.4 | 0.7 | 37.8 | 0.594 | 0.545 |
4 | 4 | 52 | Jr | A.J. Price | 79.7 | 2.8 | 8.1 | 36.1 | 0.3 | 2.3 | 38.8 | 0.480 | 0.369 |
test = pd.read_csv("cleaned2021.csv")
test.head()
Unnamed: 0 | class | player | minutes_per | off_reb_per | def_reb_per | assist_per | block_per | steal_per | free_throw_per | two_point_per | three_point_per | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Jr | Aaron Henry | 80.9 | 4.8 | 14.6 | 25.2 | 4.1 | 2.4 | 28.9 | 0.493 | 0.296 |
1 | 1 | Jr | Aaron Wiggins | 83.3 | 4.5 | 15.9 | 17.2 | 2.0 | 2.1 | 20.3 | 0.515 | 0.346 |
2 | 2 | Jr | Alan Griffin | 72.8 | 6.3 | 15.2 | 12.2 | 6.6 | 2.3 | 19.0 | 0.514 | 0.354 |
3 | 3 | Sr | Alfonso Plummer | 69.8 | 1.3 | 7.9 | 7.5 | 0.2 | 1.5 | 12.1 | 0.526 | 0.383 |
4 | 4 | So | Armando Bacot | 56.6 | 14.5 | 22.3 | 7.3 | 4.3 | 1.6 | 58.3 | 0.631 | 0.000 |
# Convert the classes to numbers for the train dataset
# Senior gets 4, junior gets 3, sophomore gets 2, freshman gets 1
classes = []
for row in train.iterrows():
    classification = row[1][2]   # 'class' sits at position 2 in the train csv (after the unnamed index and pick)
    if classification == 'Sr':
        classes.append(4)
    elif classification == 'Fr':
        classes.append(1)
    elif classification == 'Jr':
        classes.append(3)
    else:
        classes.append(2)
train['class1'] = classes
train.head()
Unnamed: 0 | pick | class | player | minutes_per | off_reb_per | def_reb_per | assist_per | block_per | steal_per | free_throw_per | two_point_per | three_point_per | class1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 46 | Fr | A.J. Hammons | 57.4 | 11.6 | 17.3 | 6.2 | 8.7 | 0.9 | 41.0 | 0.495 | 0.000 | 1 |
1 | 1 | 46 | So | A.J. Hammons | 59.6 | 10.1 | 22.8 | 4.2 | 13.2 | 0.9 | 62.2 | 0.518 | 0.000 | 2 |
2 | 2 | 46 | Jr | A.J. Hammons | 60.1 | 12.2 | 19.7 | 9.1 | 12.3 | 1.3 | 46.7 | 0.553 | 0.000 | 3 |
3 | 3 | 46 | Sr | A.J. Hammons | 57.5 | 11.5 | 24.8 | 10.5 | 10.4 | 0.7 | 37.8 | 0.594 | 0.545 | 4 |
4 | 4 | 52 | Jr | A.J. Price | 79.7 | 2.8 | 8.1 | 36.1 | 0.3 | 2.3 | 38.8 | 0.480 | 0.369 | 3 |
# Convert the classes to numbers for the test dataset
classes = []
for row in test.iterrows():
    # The test csv has no 'pick' column, so 'class' sits at position 1 here
    classification = row[1][1]
    if classification == 'Sr':
        classes.append(4)
    elif classification == 'Fr':
        classes.append(1)
    elif classification == 'Jr':
        classes.append(3)
    else:
        classes.append(2)
test['class1'] = classes
test.head()
Unnamed: 0 | class | player | minutes_per | off_reb_per | def_reb_per | assist_per | block_per | steal_per | free_throw_per | two_point_per | three_point_per | class1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Jr | Aaron Henry | 80.9 | 4.8 | 14.6 | 25.2 | 4.1 | 2.4 | 28.9 | 0.493 | 0.296 | 2 |
1 | 1 | Jr | Aaron Wiggins | 83.3 | 4.5 | 15.9 | 17.2 | 2.0 | 2.1 | 20.3 | 0.515 | 0.346 | 2 |
2 | 2 | Jr | Alan Griffin | 72.8 | 6.3 | 15.2 | 12.2 | 6.6 | 2.3 | 19.0 | 0.514 | 0.354 | 2 |
3 | 3 | Sr | Alfonso Plummer | 69.8 | 1.3 | 7.9 | 7.5 | 0.2 | 1.5 | 12.1 | 0.526 | 0.383 | 2 |
4 | 4 | So | Armando Bacot | 56.6 | 14.5 | 22.3 | 7.3 | 4.3 | 1.6 | 58.3 | 0.631 | 0.000 | 2 |
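As an aside, both conversion loops above can be written more compactly with pandas' Series.map. A sketch of the equivalent, keeping the original catch-all of 2 for any label other than Fr/Jr/Sr:
# Equivalent, more compact class-to-number conversion (sketch only)
class_map = {'Fr': 1, 'So': 2, 'Jr': 3, 'Sr': 4}
train['class1'] = train['class'].map(class_map).fillna(2).astype(int)
test['class1'] = test['class'].map(class_map).fillna(2).astype(int)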
# Collect the actual prediction output and the feature input for the train dataset
train_input, train_actual = [], []
for row in train.iterrows():
    train_actual.append(row[1][1])   # pick number is the prediction target
    # Features: the 9 per-game stat columns plus the numeric class (columns 4-13)
    train_row = [row[1][4], row[1][5], row[1][6], row[1][7], row[1][8], row[1][9],
                 row[1][10], row[1][11], row[1][12], row[1][13]]
    train_input.append(train_row)
# Collect the feature input for the test dataset (there is no pick column to collect here)
test_input = []
for row in test.iterrows():
    # Same 10 features; the test csv has no 'pick' column, so everything is shifted left by one
    test_row = [row[1][3], row[1][4], row[1][5], row[1][6], row[1][7], row[1][8], row[1][9],
                row[1][10], row[1][11], row[1][12]]
    test_input.append(test_row)
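The positional indexing above works because we know the exact column order of the two csv files, but the same matrices can be built by column name, which is easier to read and less fragile. A sketch of the equivalent, assuming the column names created earlier:
# Equivalent feature construction by column name (sketch only)
feature_cols = ['minutes_per', 'off_reb_per', 'def_reb_per', 'assist_per', 'block_per',
                'steal_per', 'free_throw_per', 'two_point_per', 'three_point_per', 'class1']
train_input = train[feature_cols].values.tolist()
train_actual = train['pick'].tolist()
test_input = test[feature_cols].values.tolist()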
K-means clustering classifies data into different clusters. Each cluster has a center point (centroid), and each data point is assigned to the cluster whose centroid it is closest to. We will separate the training data points into different clusters and visualize them. Because there are 10 input features for each data point, in order to visualize them in a 2D plane we first need to do principal component analysis (PCA): PCA finds the directions of greatest variance in the data and projects each point onto the top 2 of those directions (the first two principal components), giving 2D coordinates we can plot.
# Plot the KMeans clustering for the train data using principal component analysis
pca = PCA(2)
df = pca.fit_transform(train_input)
# Colors used for the clustering plots
colors = ['orange', 'm', 'saddlebrown', 'crimson', 'gold', 'lightcoral']
kmeans = KMeans(n_clusters=3, random_state=0, max_iter=1).fit(df)
label = kmeans.labels_
centroids = kmeans.cluster_centers_
i = 0
plt.title("KMeans clustering for the training data set 3 clusters")
while i < 3:
    plt.scatter(df[label == i, 0], df[label == i, 1], s=1, label=i, color=colors[i])
    i += 1
plt.scatter(centroids[:, 0], centroids[:, 1], s=10, color='k')
plt.show()
kmeans = KMeans(n_clusters=2, random_state=0, max_iter=1).fit(df)
label = kmeans.labels_
centroids = kmeans.cluster_centers_
i = 0
plt.title("KMeans clustering for the training data set 2 clusters")
# Only two clusters this time, so only loop twice
while i < 2:
    plt.scatter(df[label == i, 0], df[label == i, 1], s=1, label=i, color=colors[i])
    i += 1
plt.scatter(centroids[:, 0], centroids[:, 1], s=10, color='k')
plt.show()
The following function checks how accurate a given model is: given a trained model, a set of input features, and a list of the expected outputs, it predicts the outputs using the trained model and compares the predictions with the expected values.
# This dictionary stores the accuracy rate for each model
acc = dict()

def test_acc(train_actual, train_input, model):
    # Predict with the trained model and count exact matches against the expected picks
    y_pred = model.predict(train_input)
    correct, i, length = 0, 0, len(train_actual)
    while i < length:
        if train_actual[i] == y_pred[i]:
            correct += 1
        i += 1
    print(correct / length)
    return correct / length
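This helper computes plain exact-match accuracy, which scikit-learn also provides out of the box; a sketch of an equivalent helper using the built-in scorer:
# Equivalent accuracy computation using scikit-learn's built-in scorer (sketch only)
from sklearn.metrics import accuracy_score
def test_acc_sklearn(actual, inputs, model):
    return accuracy_score(actual, model.predict(inputs))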
In this part, we will use sklearn's KNeighborsClassifier algorithm on the data.
# Use k nearest neighbors with 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)
model.fit(train_input, train_actual)
# Test the accuracy of this model
acc['KNN3'] = test_acc(train_actual, train_input, model)
0.38632872503840243
# Use k nearest neighbors with 2 neighbors
model = KNeighborsClassifier(n_neighbors=2)
model.fit(train_input, train_actual)
# Test the accuracy of this model
acc['KNN2'] = test_acc(train_actual, train_input, model)
0.5314900153609831
# Use k nearest neighbors with 1 neighbor
model = KNeighborsClassifier(n_neighbors=1)
model.fit(train_input, train_actual)
# Test the accuracy of this model
acc['KNN1'] = test_acc(train_actual, train_input, model)
1.0
# use linear support vector machine
model = svm.SVC(kernel='linear')
model.fit(train_input, train_actual, sample_weight=None)
acc['SVML'] = test_acc(train_actual, train_input, model)
0.15668202764976957
# use radial basis support vector machine
model = svm.SVC(kernel='rbf')
model.fit(train_input, train_actual, sample_weight=None)
acc['SVMR'] = test_acc(train_actual, train_input, model)
0.05913978494623656
A multilayer perceptron is a type of deep learning model that uses forward computation, backpropagation, and stochastic gradient descent to train the network.
clf = MLPClassifier(random_state=1, max_iter=1500).fit(train_input, train_actual)
# Score the fitted MLP (clf) with the same helper
acc['MLP'] = test_acc(train_actual, train_input, clf)
0.05913978494623656
We will also test the logistic regression model's accuracy rate.
# Use logistic regression to predict the data
model = LogisticRegression(solver = 'liblinear')
model.fit(train_input, train_actual, sample_weight=None)
y_pred = model.predict(train_input)
acc['LGL'] = test_acc(train_actual, train_input, model)
0.08602150537634409
model = LogisticRegression(solver = 'saga')
model.fit(train_input, train_actual, sample_weight=None)
y_pred = model.predict(train_input)
acc['LGS'] = test_acc(train_actual, train_input, model)
0.06912442396313365 /home/ibutz/.local/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:328: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn("The max_iter was reached which means "
Finally, with all of our different models, we can generate a table of accuracy rates. The goal is to find a model that predicts the NBA draft well, so the model with the highest accuracy rate should be the one we want.
df1 = pd.DataFrame(np.array([['KNN3', acc['KNN3']], ['KNN2', acc['KNN2']], ['KNN1', acc['KNN1']]
,['SVM linear', acc['SVML']], ['SVM rbf', acc['SVMR']], ['MLP', acc['MLP']],
['Logistic liblinear', acc['LGL']], ['Logistic saga', acc['LGS']]])
, columns=['Model', ' Accuracy'])
df1
Model | Accuracy | |
---|---|---|
0 | KNN3 | 0.38632872503840243 |
1 | KNN2 | 0.5314900153609831 |
2 | KNN1 | 1.0 |
3 | SVM linear | 0.15668202764976957 |
4 | SVM rbf | 0.05913978494623656 |
5 | MLP | 0.05913978494623656 |
6 | Logistic liblinear | 0.08602150537634409 |
7 | Logistic saga | 0.06912442396313365 |
From the above data, we can see that KNN with 1 neighbor has the highest accuracy: 1.0. But we should be skeptical of that number: because we are scoring on the training set itself, with only 1 neighbor every data point's nearest neighbor is itself, so the model trivially reproduces the training data. The model with the second-highest accuracy is KNN with 2 neighbors, at about a 53% accuracy rate. The models with the lowest accuracy rate are the multilayer perceptron and the support vector machine with radial basis classification, at about 5.9% accuracy. Even the best model has a very low accuracy rate, partly because we only have 1302 data points as our training data; we would need more data points to improve the accuracy rate. One way to get a less optimistic estimate of accuracy is sketched below.
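A less optimistic way to estimate accuracy, which we did not use here, is to score on held-out data instead of the training set itself. A minimal sketch using scikit-learn's cross-validation helper:
# Sketch only: 5-fold cross-validated accuracy for the 2-neighbor model
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=2), train_input, train_actual, cv=5)
print(cv_scores.mean())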
We will pick the top 2 models and use them to predict the 2021 NBA draft.
# Use k nearest neighbors with 1 neighbor
model = KNeighborsClassifier(n_neighbors=1)
model.fit(train_input, train_actual)
y_pred1 = model.predict(test_input)
y_pred1
array([38, 50, 13, 47, 42, 33, 44, 59, 37, 28, 53, 36, 41, 8, 16, 14, 6, 59, 11, 58, 17, 21, 50, 57, 58, 49, 29, 53, 36, 30, 38, 58, 33, 11, 18, 53, 36, 28, 38, 41, 29, 58, 40, 27, 26, 3, 38, 26, 26, 19, 30, 41, 40, 23, 55, 24, 41, 32, 33, 24, 37, 13, 44, 28, 6, 54, 34, 44, 56, 47, 29, 31, 15, 50, 10, 56, 50, 33, 53])
# Use k nearest neighbors with 2 neighbors
model = KNeighborsClassifier(n_neighbors=2)
model.fit(train_input, train_actual)
y_pred2 = model.predict(test_input)
y_pred2
array([11, 7, 13, 44, 11, 33, 22, 34, 29, 28, 53, 36, 41, 8, 16, 14, 6, 36, 11, 49, 17, 3, 7, 41, 1, 18, 29, 53, 26, 30, 38, 46, 6, 11, 18, 41, 27, 19, 21, 29, 29, 30, 40, 27, 26, 3, 36, 26, 24, 13, 30, 41, 21, 23, 18, 18, 41, 32, 29, 24, 37, 13, 34, 28, 6, 31, 24, 10, 10, 44, 29, 31, 15, 31, 3, 11, 37, 28, 16])
pred_players_knn1, pred_players_knn2, i = dict(), dict(), 0
for row in test.iterrows():
    # Map each player's name to the pick each model predicts for them
    pred_players_knn1[row[1][2]] = y_pred1[i]
    pred_players_knn2[row[1][2]] = y_pred2[i]
    i += 1
# Predictions on players using KNN1. Just showing the top 10
sort_orders = sorted(pred_players_knn1.items(), key=lambda x: x[1])
sort_orders[:10]
[('Jose Alvarado', 3), ('Colin Castleton', 6), ('Quentin Grimes', 6), ('Carlik Jones', 8), ('Trendon Watford', 10), ('Damien Jefferson', 11), ('Isaiah Wong', 11), ('Alan Griffin', 13), ('Moussa Cisse', 13), ('Charles Bassey', 14)]
# Predictions on players with KNN2. Again, just the top 10
sort_orders = sorted(pred_players_knn2.items(), key=lambda x: x[1])
sort_orders[:10]
[("Day'Ron Sharpe", 1), ('David Duke', 3), ('Jose Alvarado', 3), ('Trendon Watford', 3), ('Colin Castleton', 6), ('Isaiah Jackson', 6), ('Quentin Grimes', 6), ('Aaron Wiggins', 7), ('David Johnson', 7), ('Carlik Jones', 8)]
First, a special shoutout to Maryland's own Aaron Wiggins; he is predicted to go number 7 by our KNN 2 model.
From our predictions, some of the top players include Jose Alvarado, Colin Castleton, and Quentin Grimes. They appear in the top 10 for both our KNN 1 and KNN 2 models. We can output some of the top draft picks from both of our models here.
df2 = pd.DataFrame(columns=['Player', 'Model KNN1 Pick'])
names = ['Jose Alvarado', 'Colin Castleton', 'Quentin Grimes', 'Carlik Jones', 'Trendon Watford']
model1 = [3, 6, 6, 8, 10]
model2 = [3, 6, 6, 8, 3]
df2['Player'] = names
df2['Model KNN1 Pick'] = model1
df2['Model KNN2 Pick'] = model2
df2
Player | Model KNN1 Pick | Model KNN2 Pick | |
---|---|---|---|
0 | Jose Alvarado | 3 | 3 |
1 | Colin Castleton | 6 | 6 |
2 | Quentin Grimes | 6 | 6 |
3 | Carlik Jones | 8 | 8 |
4 | Trendon Watford | 10 | 3 |
Now, we won't know the actual results of the draft until July. However, we can compare our guesses to what the sports media thinks by looking at mock drafts, which are what people think will happen. The following websites will be looked at: Hashtag Basketball, CBS Sports, and Bleacher Report.
All three of these websites have Cade Cunningham, a freshman from Oklahoma State, as the unanimous number 1 overall pick. Both of our models predicted him to go around pick 36, much later in the draft. Evan Mobley is projected to go second overall by Hashtag Basketball and Bleacher Report, and third by CBS; our KNN 1 model had him at pick 36 and our KNN 2 model at pick 26.
Comparing our top picks to the mock drafts, one thing to note is that CBS Sports and Bleacher Report only predicted the first round of the draft, i.e. the top 30 picks. That said, Jose Alvarado and Colin Castleton did not appear on any of these three lists as draft picks. Quentin Grimes appeared only on Hashtag Basketball, as the 58th-best prospect, barely squeaking into the end of the draft. Sadly, our picks and the sports media's picks do not seem to align.
From our low model accuracy to predictions that did not match what most other people are saying, our model is definitely not the best. What would be the best ways to improve it? First, more data. As stated in the beginning, until relatively recently (the rule changed in 2006) NBA players did not have to attend college before playing; many top players were drafted straight from high school. If data were collected over the next ten years, the sample size would effectively be doubled, giving better insight into overall trends.
Possibly one of the largest pieces of data we were missing was position. A point guard is typically more skilled offensively: shooting, passing, and making plays. A center is larger and will have more dunks, a worse free throw percentage, and a lot of rebounds. Since we did not have positional data, all players were grouped together. Had we divided by position, perhaps there would be a relation between, for example, point guard shooting percentage and draft position.
More data analysis in general could have been beneficial. We did not see any trends in the data, so our model was trained on all of it. If we could better refine what parameters to train our data on, perhaps the model would be more accurate.
Of course, the final test of our predictions will be the actual NBA draft. You can click here to stay up to date or find out how to watch. Thank you for following along with our data science tutorial; we hope that you were inspired to do some data analysis of your own!