import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import linear_model

pd.set_option('display.max_columns', None)

# Ask the StatsAPI schedule endpoint for every 2023 regular season game.
schedule_params = {
    "sportId": 1,               # indicates Major League Baseball (AL + NL)
    "startDate": "2023-01-01",  # start of 2023
    "endDate": "2023-12-01",    # end of 2023
    "gameTypes": "R",           # only regular season games, no postseason or spring training
}
r = requests.get("https://statsapi.mlb.com/api/v1/schedule", params=schedule_params)

# The response holds a list of dates, and each date holds the list of games for that day.
query = json.loads(r.content)

# Collect the game_pk of every scheduled game.
game_pks = [game['gamePk'] for date in query['dates'] for game in date['games']]

# Remove duplicates: a postponed game is listed under both its originally
# scheduled date and the date it was actually played.
games = list(set(game_pks))

print(len(games))                             # total number of games
print(*(games[i] for i in range(4)))          # print out first four game_pks as a sample

2430
716352 716353 716354 716355

def get_game_pitches(game_pk):
    """Return a DataFrame of every pitch thrown in the game ``game_pk``.

    Pitch-level data is downloaded from the Baseball Savant gamefinder
    endpoint, tagged with game-level information (teams, date, day/night,
    home plate umpire), and left-joined on ``play_id`` with extra per-pitch
    columns (outs, runner going, spin direction, break metrics and the result
    of the plate appearance) taken from the StatsAPI v1.1 live feed.

    Args:
        game_pk: game identifier passed to both endpoints.

    Returns:
        pandas.DataFrame with one row per gamefinder pitch. The StatsAPI
        columns are NaN for pitches that have no matching ``play_id``.
    """
    # download game data from gamefinder through a request
    savant_response = requests.get("https://baseballsavant.mlb.com/gf", params={"game_pk": game_pk})
    game = json.loads(savant_response.content)

    # collect all pitches
    # gamefinder sorts all pitch data by who is the team pitching (home/away) and has a list of all those pitches
    away_pitches = pd.DataFrame(game['team_away'])      # away team is pitching/ home team is batting
    home_pitches = pd.DataFrame(game['team_home'])      # home team is pitching/ away team is batting

    # assign top/bottom half of innings based on who is pitching
    away_pitches["half_inning"] = "bottom"
    home_pitches["half_inning"] = "top"

    # concatenate the two dataframes
    pitches = pd.concat([away_pitches, home_pitches], ignore_index=True)

    # place information about the game as a whole in each row
    teams = game["scoreboard"]["teams"]
    pitches["away_team"]   = teams["away"]["name"]
    pitches["away_abbrev"] = teams["away"]["abbreviation"]
    pitches["home_team"]   = teams["home"]["name"]
    pitches["home_abbrev"] = teams["home"]["abbreviation"]

    pitches["date"]      = game["scoreboard"]["datetime"]["officialDate"]
    pitches["day_night"] = game["scoreboard"]["datetime"]["dayNight"]

    # we need to find who is the home plate umpire for this particular game
    # we search through the list of officials to find which one is designated "Home Plate"
    try:
        plate_official = next(
            (official for official in game['boxscore']['officials']
             if official["officialType"] == "Home Plate"),
            None,
        )
        # `next` yields None when no official is tagged "Home Plate"; subscripting
        # that None would raise a TypeError that the KeyError handler cannot catch.
        umpire = None if plate_official is None else plate_official["official"]["fullName"]
    except KeyError:
        umpire = None
    pitches["plate_umpire"] = umpire    # add as a new column

    pitches.drop(columns=["outs"], inplace=True)    # drop this column since it is inaccurate

    # get the remaining information through StatsAPI
    feed_response = requests.get(f"https://statsapi.mlb.com/api/v1.1/game/{game_pk}/feed/live")
    feed = json.loads(feed_response.content)

    # this function extracts a dataframe of pitches from statsapi v1.1 data
    def get_pitches_play(play):
        df = pd.DataFrame(play['playEvents'])
        if 'isPitch' not in df:
            # no events carrying a pitch flag: contribute zero rows instead of raising KeyError
            return df.iloc[0:0]
        df['event_description'] = play['result']['event']
        df['event_type'] = play['result']['eventType']
        return df[df['isPitch'] == True]

    # get the pitches for all the plays
    # statsapi organizes pitches by plate appearance (i.e. a list of plate appearances, each with a list of pitches from that plate appearance)
    play_frames = [get_pitches_play(play) for play in feed['liveData']['plays']['allPlays']]
    # plays without pitches add no rows; skipping them keeps pandas from warning about empty concat entries
    extra = pd.concat([frame for frame in play_frames if not frame.empty], axis=0, ignore_index=True)

    # the data is stored as a dictionary of dictionaries, so we flatten all these so that all the parameters are at the same level
    # ('breaks' only appears after 'pitchData' has been expanded, hence the order)
    for nested in ('count', 'details', 'pitchData', 'breaks'):
        if nested not in extra:
            continue    # nothing to expand; missing columns are filled in below
        extra = pd.concat([extra.drop([nested], axis=1), extra[nested].apply(pd.Series)], axis=1)

    # the `runnerGoing` column is either True or blank (NA), so we fill the NAs with false
    if 'runnerGoing' in extra:
        extra['runner_going'] = extra['runnerGoing'].fillna(False).astype(bool)
    else:
        extra['runner_going'] = False

    # set extra break information to NaN if it is not available
    # (each column is checked on its own so a partially populated feed neither
    # raises KeyError below nor has its available columns overwritten)
    for column in ('spinDirection', 'breakVertical', 'breakHorizontal', 'breakLength', 'breakAngle'):
        if column not in extra:
            extra[column] = np.nan

    # rename `play_id` for easier merging
    extra.rename({"playId": "play_id"}, inplace=True, axis=1)
    # extract just the extra columns we want from statsapi
    extra = extra[['play_id', 'outs', 'runner_going', 'spinDirection', 'event_type', 'event_description',
                   'breakVertical', 'breakHorizontal', 'breakLength', 'breakAngle']]

    # merge the two dataframes together
    return pd.merge(pitches, extra, how="left", on=["play_id"])

# Download every game (one DataFrame per game_pk) and stack them into a
# single table indexed by the pitch's play_id.
game_pitches = [get_game_pitches(game_pk) for game_pk in games]

all_pitches = pd.concat(game_pitches, ignore_index=True).set_index("play_id")
all_pitches.head()

C:\Users\Matthew\AppData\Local\Temp\ipykernel_68740\602634917.py:3: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
  all_pitches = pd.concat(game_pitches, ignore_index = True)

# Map the source column names onto one snake_case naming scheme.
column_renames = {
    'stand': 'batter_side',
    'p_throws': 'pitcher_hand',
    'pitch_type': 'pitch_code',
    'pitch_name': 'pitch_description',
    'description': 'result_description',
    'plateTime': 'plate_time',
    'zone': 'gameday_zone',
    'pfxX': 'pfx_x',
    'pfxZ': 'pfx_z',
    'breakHorizontal': 'break_x',
    'breakVertical': 'break_z',
    'breakLength': 'break_length',
    'breakAngle': 'break_angle',
    "spinDirection": "spin_axis",
    'runnerOn1B': 'runner_1b',
    'runnerOn2B': 'runner_2b',
    'runnerOn3B': 'runner_3b',
}
all_pitches.rename(columns=column_renames, inplace=True)

# Columns to keep, in display order; anything not listed here is dropped.
kept_columns = [
    'game_pk', 'date', 'day_night',
    'away_team', 'away_abbrev', 'home_team', 'home_abbrev',
    'game_total_pitches',
    'inning', 'half_inning', 'outs',
    'runner_1b', 'runner_2b', 'runner_3b', 'runner_going',
    'batter_name', 'batter_side', 'pitcher_name', 'pitcher_hand',
    'pitch_number',
    'balls', 'strikes',
    'result_code', 'result_description',
    'px', 'pz', 'sz_top', 'sz_bot',
    'gameday_zone',
    'pitch_code', 'pitch_description',
    'start_speed', 'end_speed', 'plate_time',
    'x0', 'y0', 'z0',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'pfx_x', 'pfx_z',
    'spin_rate', 'spin_axis',
    'break_x', 'break_z',
    'break_length', 'break_angle',
    'event_type', 'event_description',
    'plate_umpire',
]
all_pitches = all_pitches[kept_columns]

# Missing values in the base-runner columns are replaced with False.
for runner_column in ("runner_1b", "runner_2b", "runner_3b"):
    all_pitches[runner_column] = all_pitches[runner_column].fillna(False)

all_pitches.head()

# Keep only the pitches whose result code is in this list, then discard any
# row that still has a missing value in some column.
result_codes = ["*B", "B", "C", "H", "I", "P"]
is_taken = all_pitches['result_code'].isin(result_codes)
taken_pitches = all_pitches[is_taken]
taken_pitches.head()

print("Number of taken pitches including incomplete observations:", taken_pitches.shape[0])
taken_pitches = taken_pitches.dropna()
print("Number of taken pitches after removing incomplete observations:", taken_pitches.shape[0])
taken_pitches.head()

Number of taken pitches including incomplete observations: 376681
Number of taken pitches after removing incomplete observations: 374426

def expected_strike(zone):
    """Return True when ``zone``, converted to an int, is below 10."""
    zone_number = int(zone)
    return 10 > zone_number

def actual_strike(result_code):
    """Return True when ``result_code`` is "C", the code treated here as a called strike."""
    is_called_strike = result_code == "C"
    return is_called_strike

# expected_call: strike according to the gameday zone
# actual_call:   strike according to the recorded result code
# correct_call:  True where the two agree
expected_calls = taken_pitches['gameday_zone'].apply(expected_strike)
actual_calls = taken_pitches['result_code'].apply(actual_strike)

taken_pitches['expected_call'] = expected_calls
taken_pitches['actual_call'] = actual_calls
taken_pitches['correct_call'] = taken_pitches['expected_call'] == taken_pitches['actual_call']

# Headline pitch counts for the season.
# The number of games is derived from the data instead of hard-coding 2430,
# so the per-game averages stay correct if the set of downloaded games changes.
num_games = all_pitches["game_pk"].nunique()
total_pitches = len(all_pitches)
total_taken = len(taken_pitches)

print("Number of pitches thrown in 2023:", total_pitches)
print("Average pitches thrown per game in 2023:", total_pitches / num_games)
print("Number of taken pitches in 2023:", total_taken)
print("Average taken pitches per game in 2023:", total_taken / num_games)
print("Percent of all pitches taken in 2023:", total_taken / total_pitches * 100)

Number of pitches thrown in 2023: 717945
Average pitches thrown per game in 2023: 295.45061728395063
Number of taken pitches in 2023: 374426
Average taken pitches per game in 2023: 154.08477366255144
Percent of all pitches taken in 2023: 52.15246293239733

# Season-wide umpire accuracy on taken pitches.
# The number of games is derived from the data instead of hard-coding 2430.
num_games = all_pitches["game_pk"].nunique()
incorrect_calls = int((taken_pitches["correct_call"] == False).sum())

print("Number of incorrect calls in 2023:", incorrect_calls)
print("Average incorrect calls per game in 2023:", incorrect_calls / num_games)
overall_accuracy = taken_pitches["correct_call"].mean() * 100
print("Overall umpire accuracy in 2023:", overall_accuracy)
# the bad call ratio is simply the complement of the accuracy computed above
print("Overall bad call ratio (BCR) in 2023:", 100 - overall_accuracy)

Number of incorrect calls in 2023: 26741
Average incorrect calls per game in 2023: 11.004526748971193
Overall umpire accuracy in 2023: 92.85813485174641
Overall bad call ratio (BCR) in 2023: 7.141865148253586

# Distribution of release speeds in 1 mph bins from 75 to 100 mph.
# The title describes *taken* pitches, so plot taken_pitches; the original
# plotted all_pitches, which contradicted the title.
plt.hist(taken_pitches["start_speed"], bins=np.arange(75, 101, 1))
plt.ylabel("Count of pitches in 2023")
plt.xlabel("Release speed of pitch (mph)")
plt.title("Histogram of pitch release speeds of taken pitches in 2023")
plt.show()

# Bucket taken pitches into 1 mph release-speed bins and plot the share of
# correct calls in each bin against the upper edge of that bin.
speed_edges = np.arange(75, 101, 1)
taken_pitches["speed"] = pd.cut(taken_pitches["start_speed"], bins=speed_edges)
speed_pitches = taken_pitches.groupby("speed", observed=False)
accuracy_by_speed = speed_pitches["correct_call"].mean() * 100

plt.plot(speed_edges[1:], accuracy_by_speed)
# dashed reference line at the overall accuracy
plt.plot([75, 100], [overall_accuracy] * 2, linestyle='dashed')
plt.xlabel("Release speed of pitch (mph)")
plt.ylabel("Umpire accuracy on called pitches")
plt.title("Umpire accuracy vs. pitch speed")
plt.show()

# Per pitch code: how many calls were made and how accurate they were.
pitch_types = pd.DataFrame([
    {
        'name': pitch_code,
        'num_calls': pitches['correct_call'].count(),
        'accuracy': pitches['correct_call'].mean() * 100,
    }
    for pitch_code, pitches in taken_pitches.groupby("pitch_code")
])
# keep pitch types with more than 1000 calls, most frequent first
pitch_types = pitch_types[pitch_types["num_calls"] > 1000].sort_values(by="num_calls", ascending=False)

# NOTE(review): the colors are assigned by bar position, not by pitch code;
# presumably they encode a pitch category — confirm they still line up if the
# ordering of pitch types changes.
plt.bar(pitch_types['name'], pitch_types['num_calls'],
        color=['red', 'green', 'red', 'blue', 'green', 'red', 'green', 'blue', 'green', 'green'])
plt.xlabel("Pitch type")
plt.ylabel("Number of calls")
plt.title("Number of calls by pitch type")
# plt.plot() with no arguments draws nothing and leaves the figure open, so the
# next chart would be drawn on top of this one; show (and finish) the figure instead.
plt.show()

[]

# Per pitch code: how many calls were made and how accurate they were.
pitch_types = pd.DataFrame([
    {
        'name': pitch_code,
        'num_calls': pitches['correct_call'].count(),
        'accuracy': pitches['correct_call'].mean() * 100,
    }
    for pitch_code, pitches in taken_pitches.groupby("pitch_code")
])
# keep pitch types with more than 1000 calls, most accurate first
pitch_types = pitch_types[pitch_types["num_calls"] > 1000].sort_values(by="accuracy", ascending=False)

# NOTE(review): the colors are assigned by bar position, not by pitch code —
# confirm they still line up if the ordering of pitch types changes.
plt.bar(pitch_types['name'], pitch_types['accuracy'],
        color=['blue', 'blue', 'green', 'green', 'green', 'green', 'green', 'red', 'red', 'red'])
# dashed reference line at the overall accuracy, spanning however many bars were drawn
# (bars sit at x = 0 .. n-1, so the span is -0.5 .. n-0.5)
plt.plot([-0.5, len(pitch_types) - 0.5], [overall_accuracy, overall_accuracy],
         linestyle='dashed', color='orange')
plt.ylim(90, 96)
plt.xlabel("Pitch type")
plt.ylabel("Umpire accuracy on called pitches")
plt.title("Umpire accuracy vs. pitch type")
# plt.plot() with no arguments draws nothing and leaves the figure open for
# later plotting calls; show (and finish) the figure instead.
plt.show()

[]

# Reference strike zone that every pitch is mapped onto.
SZ_TOP = 3.40   # top of the reference zone
SZ_BOT = 1.60   # bottom of the reference zone
RAD = 0.12      # margin kept around the zone edges (presumably the ball radius in feet — confirm)


def normalized_pz(df):
    """Map a pitch's height from the batter's own zone onto the reference zone.

    ``df`` is a row-like object exposing ``'pz'``, ``'sz_top'`` and ``'sz_bot'``.

    * Above ``sz_top - RAD``: the distance to the batter's zone top is kept
      and re-applied relative to ``SZ_TOP``.
    * Below ``sz_bot + RAD``: the distance to the batter's zone bottom is kept
      and re-applied relative to ``SZ_BOT``.
    * Otherwise: the height is rescaled linearly so that ``sz_bot`` maps to
      ``SZ_BOT`` and ``sz_top`` maps to ``SZ_TOP``.
    """
    height = df['pz']
    zone_top = df['sz_top']
    zone_bottom = df['sz_bot']

    # near or above the top edge: shift only
    if height > zone_top - RAD:
        return height - zone_top + SZ_TOP

    # near or below the bottom edge: shift only
    if height < zone_bottom + RAD:
        return height - zone_bottom + SZ_BOT

    # interior of the zone: linear rescale
    return SZ_BOT + (SZ_TOP - SZ_BOT) * ((height - zone_bottom) / (zone_top - zone_bottom))

def normalized_px(df):
    """Return the pitch's ``'px'``, negated when ``'batter_side'`` is "L".

    ``df`` is a row-like object exposing ``'px'`` and ``'batter_side'``.
    Mirroring the value for one batter side puts both sides on a common
    horizontal axis.
    """
    horizontal = df['px']
    side = df['batter_side']

    if side == "L":
        return -horizontal
    return horizontal

# Normalized pitch location, computed row by row.
taken_pitches["px_norm"] = taken_pitches.apply(normalized_px, axis="columns")
taken_pitches["pz_norm"] = taken_pitches.apply(normalized_pz, axis="columns")

# The binning grid spans twice the zone's half-width and half-height on each
# side of its center, split into 32 equal bins per axis (33 edges).
PLATE_EDGE = 17/24 + 0.12       # half-width of the zone being binned
MIDDLE = (3.4 + 1.6) / 2        # vertical center of the zone being binned
HALF_HEIGHT = 1.02              # half-height of the zone being binned

horiz_edges = np.linspace(-PLATE_EDGE * 2, PLATE_EDGE * 2, 33)
vert_edges = np.linspace(MIDDLE - HALF_HEIGHT * 2, MIDDLE + HALF_HEIGHT * 2, 33)

taken_pitches["horiz_zone"] = pd.cut(taken_pitches["px_norm"], bins=horiz_edges)
taken_pitches["vert_zone"] = pd.cut(taken_pitches["pz_norm"], bins=vert_edges)

# Share of pitches expected to be strikes in each location bin; rows are
# reversed so that the highest vertical bin is drawn at the top.
zoned_pitches = (
    taken_pitches
    .groupby(["vert_zone", "horiz_zone"], observed=False)["expected_call"]
    .mean()
    .unstack()
    .iloc[::-1]
)
sns.heatmap(zoned_pitches)

# cell height / cell width of the bins, so both axes are drawn at the same scale
plt.gca().set_aspect(0.255/0.207)
# outline covering the middle 16 of the 32 bins on each axis
box_x = [8, 8, 24, 24, 8]
box_y = [8, 24, 24, 8, 8]
plt.plot(box_x, box_y, color="green")
plt.xticks([], [])
plt.yticks([], [])
plt.xlabel("Normalized horizontal location")
plt.ylabel("Normalized vertical location")
plt.title("Heatmap of expected calls on pitches")
plt.show()

# Share of pitches actually called strikes in each location bin; rows are
# reversed so that the highest vertical bin is drawn at the top.
zoned_pitches = taken_pitches.groupby(["vert_zone", "horiz_zone"], observed=False)["actual_call"].mean().unstack()[::-1]
sns.heatmap(zoned_pitches)

# cell height / cell width of the bins, so both axes are drawn at the same scale
plt.gca().set_aspect(0.255/0.207)
# outline covering the middle 16 of the 32 bins on each axis
plt.plot([8,8,24,24,8],[8,24,24,8,8], color="green")
plt.xticks([], [])
plt.xlabel("Normalized horizontal location")
plt.yticks([], [])
plt.ylabel("Normalized vertical location")
# this figure shows `actual_call`; the previous title ("expected calls") was
# copied from the expected-call heatmap
plt.title("Heatmap of actual calls on pitches")
plt.show()

# Same called-strike heatmap, restricted to pitches thrown with two strikes.
two_strike_pitches = taken_pitches[taken_pitches["strikes"] == 2]
zoned_pitches_strike2 = two_strike_pitches.groupby(["vert_zone", "horiz_zone"], observed=False)["actual_call"].mean().unstack()[::-1]
sns.heatmap(zoned_pitches_strike2)

# cell height / cell width of the bins, so both axes are drawn at the same scale
plt.gca().set_aspect(0.255/0.207)
# outline covering the middle 16 of the 32 bins on each axis
plt.plot([8,8,24,24,8],[8,24,24,8,8], color="green")
plt.xticks([], [])
plt.xlabel("Normalized horizontal location")
plt.yticks([], [])
plt.ylabel("Normalized vertical location")
# this figure shows `actual_call`, not `expected_call`, so the title says so
plt.title("Heatmap of actual calls on pitches with 2 strikes")
plt.show()

# Two-strike heatmap minus the previously computed `zoned_pitches` heatmap,
# on a diverging color scale centered on zero.
strike2_difference = zoned_pitches_strike2 - zoned_pitches
lim = strike2_difference.abs().max().max()    # symmetric color limits
sns.heatmap(strike2_difference, cmap="coolwarm", vmin=-lim, vmax=lim)

# cell height / cell width of the bins, so both axes are drawn at the same scale
plt.gca().set_aspect(0.255/0.207)
# outline covering the middle 16 of the 32 bins on each axis
plt.plot([8,8,24,24,8],[8,24,24,8,8], color="green")
plt.xticks([], [])
plt.xlabel("Normalized horizontal location")
plt.yticks([], [])
plt.ylabel("Normalized vertical location")
# the two-strike grid holds `actual_call` rates, so the title refers to actual calls
plt.title("Difference of actual calls on pitches with 2 strikes")
plt.show()

# Model inputs (pitch location, batter zone, break, speed, count) and the
# target (whether the pitch was actually called a strike).
feature_cols = ['px_norm', 'pz', 'sz_top', 'sz_bot', 'break_x', 'break_z', 'start_speed', 'balls', 'strikes']
X = taken_pitches[feature_cols] # Features
y = taken_pitches.actual_call

# Hold out 20% of the pitches for testing. A fixed random_state makes the
# shuffle, and therefore every metric reported afterwards, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=42)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

Training set size: 299540
Testing set size: 74886

# Fit a decision tree on the training pitches and score it on the held-out set.
# random_state is fixed so that refitting on the same data reproduces the same tree.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model = tree_model.fit(X_train, y_train)

y_pred_tree = tree_model.predict(X_test)

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred_tree))

Accuracy:  0.9090751275271747

# Per-class precision / recall / F1 of the tree on the test set.
tree_report = classification_report(y_test, y_pred_tree)
print(tree_report)

              precision    recall  f1-score   support

       False       0.93      0.93      0.93     51445
        True       0.85      0.85      0.85     23441

    accuracy                           0.91     74886
   macro avg       0.89      0.89      0.89     74886
weighted avg       0.91      0.91      0.91     74886

# Confusion matrix of the tree's predictions on the test set.
class_names = ["False", "True"]    # name of classes, in sorted label order
cnf_matrix_tree = metrics.confusion_matrix(y_test, y_pred_tree)

fig, ax = plt.subplots()
# Label the rows/columns through the DataFrame: tick labels set with
# plt.xticks/plt.yticks *before* sns.heatmap are overwritten by the heatmap.
sns.heatmap(pd.DataFrame(cnf_matrix_tree, index=class_names, columns=class_names),
            annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.tight_layout()    # after the labels exist, so they are included in the layout
# finish this figure so that later plotting calls start on a fresh one
plt.show()

Text(0.5, 427.9555555555555, 'Predicted label')

# ROC curve of the tree, using the predicted probability of the second class
# (column 1 of predict_proba) as the score.
class_probabilities = tree_model.predict_proba(X_test)
y_pred_proba_tree = class_probabilities[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_tree)
auc = metrics.roc_auc_score(y_test, y_pred_proba_tree)

plt.plot(fpr, tpr, label=f"data 1, auc={auc}")
plt.legend(loc=4)
plt.title("Receiver operating characteristics curve")
plt.show()

# Draw the top of the fitted tree (limited to depth 3) on a wide figure.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    tree_model,
    max_depth=3,
    feature_names=feature_cols,
    filled=True,
    fontsize=10,
)
plt.show()

# Rank the features by the importance the fitted tree assigned to them.
importances = pd.DataFrame({
    'feature': X_train[feature_cols].columns,
    'importance': tree_model.feature_importances_,
})
importances = importances.sort_values(by='importance', ascending=False)
print(importances)

       feature  importance
0      px_norm    0.444101
1           pz    0.397211
2       sz_top    0.035773
3       sz_bot    0.035729
4      break_x    0.027359
5      break_z    0.024478
6  start_speed    0.023337
7        balls    0.006391
8      strikes    0.005621

# Ask the StatsAPI schedule endpoint for every 2023 postseason game.
# postseason game types include
#   F = wild card series
#   D = division series
#   L = league championship series
#   W = world series
schedule_params = {
    "sportId": 1,               # indicates Major League Baseball (AL + NL)
    "startDate": "2023-01-01",  # start of 2023
    "endDate": "2023-12-01",    # end of 2023
    "gameTypes": ["F", "D", "L", "W"],
}
r = requests.get("https://statsapi.mlb.com/api/v1/schedule", params=schedule_params)

# The response holds a list of dates, and each date holds the list of games for that day.
query = json.loads(r.content)

# Collect the game_pk of every scheduled game.
game_pks = [game['gamePk'] for date in query['dates'] for game in date['games']]

# Remove duplicates: a postponed game is listed under both its originally
# scheduled date and the date it was actually played.
games = list(set(game_pks))

print(len(games))                               # total number of games

41

# Download every postseason game (one DataFrame per game_pk) and stack them
# into a single table indexed by the pitch's play_id.
game_pitches = [get_game_pitches(game_pk) for game_pk in games]

postseason_pitches = pd.concat(game_pitches, ignore_index=True).set_index("play_id")
postseason_pitches.head()

# Map the source column names onto one snake_case naming scheme.
postseason_renames = {
    'stand': 'batter_side',
    'p_throws': 'pitcher_hand',
    'pitch_type': 'pitch_code',
    'pitch_name': 'pitch_description',
    'description': 'result_description',
    'plateTime': 'plate_time',
    'zone': 'gameday_zone',
    'pfxX': 'pfx_x',
    'pfxZ': 'pfx_z',
    'breakHorizontal': 'break_x',
    'breakVertical': 'break_z',
    'breakLength': 'break_length',
    'breakAngle': 'break_angle',
    "spinDirection": "spin_axis",
    'runnerOn1B': 'runner_1b',
    'runnerOn2B': 'runner_2b',
    'runnerOn3B': 'runner_3b',
}
postseason_pitches.rename(columns=postseason_renames, inplace=True)

# Columns to keep, in display order; anything not listed here is dropped.
postseason_columns = [
    'game_pk', 'date', 'day_night',
    'away_team', 'away_abbrev', 'home_team', 'home_abbrev',
    'game_total_pitches',
    'inning', 'half_inning', 'outs',
    'runner_1b', 'runner_2b', 'runner_3b', 'runner_going',
    'batter_name', 'batter_side', 'pitcher_name', 'pitcher_hand',
    'pitch_number',
    'balls', 'strikes',
    'result_code', 'result_description',
    'px', 'pz', 'sz_top', 'sz_bot',
    'gameday_zone',
    'pitch_code', 'pitch_description',
    'start_speed', 'end_speed', 'plate_time',
    'x0', 'y0', 'z0',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'pfx_x', 'pfx_z',
    'spin_rate', 'spin_axis',
    'break_x', 'break_z',
    'break_length', 'break_angle',
    'event_type', 'event_description',
    'plate_umpire',
]
postseason_pitches = postseason_pitches[postseason_columns]

# Missing values in the base-runner columns are replaced with False.
for runner_column in ("runner_1b", "runner_2b", "runner_3b"):
    postseason_pitches[runner_column] = postseason_pitches[runner_column].fillna(False)

# Keep only the pitches whose result code is in this list, then discard any
# row that still has a missing value in some column.
result_codes = ["*B", "B", "C", "H", "I", "P"]
is_taken = postseason_pitches['result_code'].isin(result_codes)
postseason_taken_pitches = postseason_pitches[is_taken]
postseason_taken_pitches = postseason_taken_pitches.dropna()

# expected_call: strike according to the gameday zone
# actual_call:   strike according to the recorded result code
# correct_call:  True where the two agree
postseason_taken_pitches['expected_call'] = postseason_taken_pitches['gameday_zone'].apply(expected_strike)
postseason_taken_pitches['actual_call']   = postseason_taken_pitches['result_code'].apply(actual_strike)
postseason_taken_pitches['correct_call']  = postseason_taken_pitches['expected_call'] == postseason_taken_pitches['actual_call']

# normalized pitch location, computed row by row
postseason_taken_pitches["px_norm"] = postseason_taken_pitches.apply(normalized_px, axis=1)
postseason_taken_pitches["pz_norm"] = postseason_taken_pitches.apply(normalized_pz, axis=1)

# report the size of the *taken* subset; the original printed
# len(postseason_pitches), i.e. every postseason pitch
print("Number of taken pitches in the 2023 postseason:", len(postseason_taken_pitches))
postseason_taken_pitches.head()

Number of taken pitches in the 2023 postseason: 11829

# Call predicted by the fitted tree for every postseason taken pitch.
postseason_features = postseason_taken_pitches[feature_cols]
postseason_taken_pitches["pred_call"] = tree_model.predict(postseason_features)
def credit(pitch):
    """Score one pitch by comparing the model's call with the other two calls.

    ``pitch`` is a row-like object exposing ``'pred_call'``, ``'actual_call'``
    and ``'expected_call'``.

    Returns:
        0  when the predicted call equals the actual call;
        -1 when they differ and the predicted call equals the expected call;
        1  when the predicted call differs from both (for boolean calls this
           means the actual call matched the expected call).
    """
    predicted = pitch["pred_call"]

    # the actual call agrees with the model: no credit either way
    if predicted == pitch["actual_call"]:
        return 0

    # the actual call differs from the model; the sign depends on whether the
    # model agreed with the expected call
    return -1 if predicted == pitch['expected_call'] else 1
# Score every postseason taken pitch, one row at a time.
postseason_taken_pitches["credit"] = postseason_taken_pitches.apply(credit, axis="columns")

# One summary row per home plate umpire: number of calls, number of
# incorrect calls, accuracy, and the summed credit.
umpire_rows = [
    {
        'umpire_name': umpire,
        'called': len(pitches),
        'num_inc': len(pitches[pitches['correct_call'] == False]),
        'accuracy': pitches['correct_call'].mean(),
        'calls_above_avg': pitches['credit'].sum(),
    }
    for umpire, pitches in postseason_taken_pitches.groupby("plate_umpire")
]
umpire_stats = pd.DataFrame(umpire_rows)
umpire_stats.set_index("umpire_name", inplace=True)

# Five most accurate umpires.
umpire_stats.sort_values(by="accuracy", ascending=False).iloc[:5]

# Five least accurate umpires.
umpire_stats.sort_values(by="accuracy", ascending=True).iloc[:5]

# Five umpires with the highest summed credit.
umpire_stats.sort_values(by="calls_above_avg", ascending=False).iloc[:5]

# Five umpires with the lowest summed credit.
umpire_stats.sort_values(by="calls_above_avg", ascending=True).iloc[:5]

plt.scatter(umpire_stats["accuracy"], umpire_stats["calls_above_avg"])


reg = linear_model.LinearRegression()
reg.fit(umpire_stats["accuracy"].values.reshape(-1, 1), umpire_stats["calls_above_avg"])

def regression(x):
    """Evaluate the fitted line of the module-level model `reg` at `x`.

    Uses the first coefficient of `reg` as the slope and its intercept as the
    offset, i.e. returns slope * x + intercept.
    """
    slope = reg.coef_[0]
    return slope * x + reg.intercept_

# Draw the regression line. axline extends infinitely through the two given
# points, so the anchors only need to be two distinct x values on the line.
line_start = (0.875, regression(0.875))
line_end = (1, regression(1))
plt.axline(line_start, line_end)

# Label the figure and render it.
plt.title("Correct calls above average vs. accuracy")
plt.xlabel("Accuracy")
plt.ylabel("Correct calls above average")
plt.show()

	inning	ab_number	cap_index	batter	stand	batter_name	pitcher	p_throws	pitcher_name	team_batting	team_fielding	team_batting_id	team_fielding_id	result	des	events	contextMetrics	strikes	balls	pre_strikes	pre_balls	call	call_name	pitch_type	pitch_name	description	result_code	pitch_call	is_strike_swinging	balls_and_strikes	start_speed	end_speed	sz_top	sz_bot	extension	plateTime	zone	spin_rate	px	pz	x0	y0	z0	ax	ay	az	vx0	vy0	vz0	pfxX	pfxZ	pfxZWithGravity	pfxZWithGravityNice	pfxZDirection	pfxXWithGravity	pfxXNoAbs	pfxXDirection	breakX	breakZ	inducedBreakZ	is_bip_out	pitch_number	player_total_pitches	player_total_pitches_pitch_types	game_total_pitches	rowId	game_pk	player_name	hit_speed_round	hit_speed	hit_distance	xba	hit_angle	is_barrel	hc_x	hc_x_ft	hc_y	hc_y_ft	runnerOn1B	runnerOn2B	runnerOn3B	half_inning	away_team	away_abbrev	home_team	home_abbrev	date	day_night	plate_umpire	outs	runner_going	spinDirection	event_type	event_description	breakVertical	breakHorizontal	breakLength	breakAngle
play_id
fd40dc91-e458-47d4-81d4-bdd654781440	1	6	0	672580	R	Maikel Garcia	650633	R	Michael King	KC	NYY	118	147	Lineout	Maikel Garcia lines out to center fielder Este...	Lineout	{}	0	0	0	0	S	Strike	SI	Sinker	Called Strike	C	called_strike	False	00	92.0	85.2	3.531020	1.612927	5.922587	0.407594	7.0	2491.0	-0.292692	2.207051	-2.461166	50.003600	5.557445	-17.086921	25.264212	-25.635070	8.973728	-133.710840	-4.062951	-9.312735	3.563786	-26.040910	26.0	↓	16	-16	→	15.0	27.0	5.0	Y	1	1	1	23	23-716352	716352	NYY	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	bottom	New York Yankees	NYY	Kansas City Royals	KC	2023-10-01	day	Clint Vondrak	0	False	200.0	field_out	Lineout	-26.2	15.2	7.2	30.0
60d12f55-1c47-42da-8f3e-d3e2e6e47ee2	1	6	0	672580	R	Maikel Garcia	650633	R	Michael King	KC	NYY	118	147	Lineout	Maikel Garcia lines out to center fielder Este...	Lineout	{}	1	0	1	0	S	Strike	FF	4-Seam Fastball	Swinging Strike	S	swinging_strike	True	01	93.8	86.9	3.590000	1.730000	5.691580	0.398518	11.0	2330.0	-1.158241	3.385914	-2.388350	50.006551	5.757462	-6.665766	26.327695	-16.695794	4.566470	-136.737542	-3.362402	-3.472281	8.067414	-17.017648	17.0	↓	6	-6	→	6.0	17.0	14.0	Y	2	2	1	24	24-716352	716352	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	bottom	New York Yankees	NYY	Kansas City Royals	KC	2023-10-01	day	Clint Vondrak	0	False	212.0	field_out	Lineout	-16.4	5.5	3.6	18.0
50bd9d0f-eee5-4f99-9dc4-a1fa02594bf9	1	6	0	672580	R	Maikel Garcia	650633	R	Michael King	KC	NYY	118	147	Lineout	Maikel Garcia lines out to center fielder Este...	Lineout	{'homeRunBallparks': 0}	2	0	2	0	X	In Play	FF	4-Seam Fastball	In play, out(s)	X	hit_into_play	False	02	94.3	87.4	3.590000	1.730000	6.028961	0.396397	11.0	2377.0	-0.008739	3.769854	-2.256394	50.001334	5.865596	-6.578440	25.957664	-16.315596	7.339570	-137.357644	-2.731021	-3.389510	8.170515	-16.518106	17.0	↓	6	-6	→	5.0	16.0	14.0	Y	3	3	2	25	25-716352	716352	NaN	99.0	99.4	360	.410	18	0.0	114.37	-26.104127	54.75	358.29979	NaN	NaN	NaN	bottom	New York Yankees	NYY	Kansas City Royals	KC	2023-10-01	day	Clint Vondrak	0	False	203.0	field_out	Lineout	-15.8	4.9	3.6	16.8
6c79e8ff-4972-4ad1-85cb-f994727c1f90	1	7	0	677951	R	Bobby Witt Jr.	650633	R	Michael King	KC	NYY	118	147	Groundout	Bobby Witt Jr. grounds out, second baseman Os...	Groundout	{}	0	0	0	0	S	Strike	SI	Sinker	Called Strike	C	called_strike	False	00	93.5	87.0	3.403536	1.631027	5.979491	0.400484	9.0	1859.0	0.311409	1.618673	-2.256811	50.004739	5.486336	-17.570281	25.019013	-22.974243	10.189194	-135.885978	-6.196413	-9.236481	4.838141	-22.778260	23.0	↓	16	-16	→	15.0	23.0	7.0	Y	1	4	2	26	26-716352	716352	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	bottom	New York Yankees	NYY	Kansas City Royals	KC	2023-10-01	day	Clint Vondrak	1	False	165.0	field_out	Groundout	-23.2	15.0	6.0	33.6
e3f2d82a-9778-43aa-a813-05f91eb2122e	1	7	0	677951	R	Bobby Witt Jr.	650633	R	Michael King	KC	NYY	118	147	Groundout	Bobby Witt Jr. grounds out, second baseman Os...	Groundout	{}	1	0	1	0	B	Ball	SI	Sinker	Ball	B	ball	False	01	94.0	86.8	3.491020	1.660077	6.095591	0.398597	11.0	2321.0	-1.375257	2.676168	-2.402972	50.001801	5.610981	-15.254782	27.280516	-20.179332	5.599407	-136.873132	-4.246522	-7.957966	6.256626	-20.090125	20.0	↓	13	-13	→	14.0	20.0	11.0	Y	2	5	3	27	27-716352	716352	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	bottom	New York Yankees	NYY	Kansas City Royals	KC	2023-10-01	day	Clint Vondrak	1	False	215.0	field_out	Groundout	-19.9	13.4	4.8	33.6

	game_pk	date	day_night	away_team	away_abbrev	home_team	home_abbrev	game_total_pitches	inning	half_inning	outs	runner_1b	runner_2b	runner_3b	runner_going	batter_name	batter_side	pitcher_name	pitcher_hand	pitch_number	balls	strikes	result_code	result_description	px	pz	sz_top	sz_bot	gameday_zone	pitch_code	pitch_description	start_speed	end_speed	plate_time	x0	y0	z0	vx0	vy0	vz0	ax	ay	az	pfx_x	pfx_z	spin_rate	spin_axis	break_x	break_z	break_length	break_angle	event_type	event_description	plate_umpire
play_id
fd40dc91-e458-47d4-81d4-bdd654781440	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	23	1	bottom	0	False	False	False	False	Maikel Garcia	R	Michael King	R	1	0	0	C	Called Strike	-0.292692	2.207051	3.531020	1.612927	7.0	SI	Sinker	92.0	85.2	0.407594	-2.461166	50.003600	5.557445	8.973728	-133.710840	-4.062951	-17.086921	25.264212	-25.635070	-9.312735	3.563786	2491.0	200.0	15.2	-26.2	7.2	30.0	field_out	Lineout	Clint Vondrak
60d12f55-1c47-42da-8f3e-d3e2e6e47ee2	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	24	1	bottom	0	False	False	False	False	Maikel Garcia	R	Michael King	R	2	0	1	S	Swinging Strike	-1.158241	3.385914	3.590000	1.730000	11.0	FF	4-Seam Fastball	93.8	86.9	0.398518	-2.388350	50.006551	5.757462	4.566470	-136.737542	-3.362402	-6.665766	26.327695	-16.695794	-3.472281	8.067414	2330.0	212.0	5.5	-16.4	3.6	18.0	field_out	Lineout	Clint Vondrak
50bd9d0f-eee5-4f99-9dc4-a1fa02594bf9	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	25	1	bottom	0	False	False	False	False	Maikel Garcia	R	Michael King	R	3	0	2	X	In play, out(s)	-0.008739	3.769854	3.590000	1.730000	11.0	FF	4-Seam Fastball	94.3	87.4	0.396397	-2.256394	50.001334	5.865596	7.339570	-137.357644	-2.731021	-6.578440	25.957664	-16.315596	-3.389510	8.170515	2377.0	203.0	4.9	-15.8	3.6	16.8	field_out	Lineout	Clint Vondrak
6c79e8ff-4972-4ad1-85cb-f994727c1f90	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	26	1	bottom	1	False	False	False	False	Bobby Witt Jr.	R	Michael King	R	1	0	0	C	Called Strike	0.311409	1.618673	3.403536	1.631027	9.0	SI	Sinker	93.5	87.0	0.400484	-2.256811	50.004739	5.486336	10.189194	-135.885978	-6.196413	-17.570281	25.019013	-22.974243	-9.236481	4.838141	1859.0	165.0	15.0	-23.2	6.0	33.6	field_out	Groundout	Clint Vondrak
e3f2d82a-9778-43aa-a813-05f91eb2122e	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	27	1	bottom	1	False	False	False	False	Bobby Witt Jr.	R	Michael King	R	2	0	1	B	Ball	-1.375257	2.676168	3.491020	1.660077	11.0	SI	Sinker	94.0	86.8	0.398597	-2.402972	50.001801	5.610981	5.599407	-136.873132	-4.246522	-15.254782	27.280516	-20.179332	-7.957966	6.256626	2321.0	215.0	13.4	-19.9	4.8	33.6	field_out	Groundout	Clint Vondrak

	game_pk	date	day_night	away_team	away_abbrev	home_team	home_abbrev	game_total_pitches	inning	half_inning	outs	runner_1b	runner_2b	runner_3b	runner_going	batter_name	batter_side	pitcher_name	pitcher_hand	pitch_number	balls	strikes	result_code	result_description	px	pz	sz_top	sz_bot	gameday_zone	pitch_code	pitch_description	start_speed	end_speed	plate_time	x0	y0	z0	vx0	vy0	vz0	ax	ay	az	pfx_x	pfx_z	spin_rate	spin_axis	break_x	break_z	break_length	break_angle	event_type	event_description	plate_umpire
play_id
fd40dc91-e458-47d4-81d4-bdd654781440	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	23	1	bottom	0	False	False	False	False	Maikel Garcia	R	Michael King	R	1	0	0	C	Called Strike	-0.292692	2.207051	3.531020	1.612927	7.0	SI	Sinker	92.0	85.2	0.407594	-2.461166	50.003600	5.557445	8.973728	-133.710840	-4.062951	-17.086921	25.264212	-25.635070	-9.312735	3.563786	2491.0	200.0	15.2	-26.2	7.2	30.0	field_out	Lineout	Clint Vondrak
6c79e8ff-4972-4ad1-85cb-f994727c1f90	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	26	1	bottom	1	False	False	False	False	Bobby Witt Jr.	R	Michael King	R	1	0	0	C	Called Strike	0.311409	1.618673	3.403536	1.631027	9.0	SI	Sinker	93.5	87.0	0.400484	-2.256811	50.004739	5.486336	10.189194	-135.885978	-6.196413	-17.570281	25.019013	-22.974243	-9.236481	4.838141	1859.0	165.0	15.0	-23.2	6.0	33.6	field_out	Groundout	Clint Vondrak
e3f2d82a-9778-43aa-a813-05f91eb2122e	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	27	1	bottom	1	False	False	False	False	Bobby Witt Jr.	R	Michael King	R	2	0	1	B	Ball	-1.375257	2.676168	3.491020	1.660077	11.0	SI	Sinker	94.0	86.8	0.398597	-2.402972	50.001801	5.610981	5.599407	-136.873132	-4.246522	-15.254782	27.280516	-20.179332	-7.957966	6.256626	2321.0	215.0	13.4	-19.9	4.8	33.6	field_out	Groundout	Clint Vondrak
38494e8c-a6db-48ae-93b0-08f09e346853	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	29	1	bottom	1	False	False	False	False	Bobby Witt Jr.	R	Michael King	R	4	1	2	B	Ball	1.779492	1.317006	3.491020	1.718184	14.0	ST	Sweeper	81.9	76.2	0.458978	-2.404842	50.003641	5.380935	7.232091	-119.132844	-2.905202	12.379237	21.944713	-31.434472	8.574210	0.514236	2896.0	60.0	-17.3	-40.2	9.6	22.8	field_out	Groundout	Clint Vondrak
2334d79b-c2c7-4421-89b2-10cf4724d1d0	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	31	1	bottom	2	False	False	False	False	Salvador Perez	R	Michael King	R	1	0	0	C	Called Strike	0.448960	3.073323	3.540189	1.649124	3.0	SI	Sinker	93.9	87.2	0.398503	-2.280361	50.006529	5.488510	10.592485	-136.501232	-2.116903	-17.286584	24.924189	-24.112642	-8.994766	4.199174	2288.0	225.0	14.5	-23.3	6.0	32.4	strikeout	Strikeout	Clint Vondrak

	game_pk	date	day_night	away_team	away_abbrev	home_team	home_abbrev	game_total_pitches	inning	half_inning	outs	runner_1b	runner_2b	runner_3b	runner_going	batter_name	batter_side	pitcher_name	pitcher_hand	pitch_number	balls	strikes	result_code	result_description	px	pz	sz_top	sz_bot	gameday_zone	pitch_code	pitch_description	start_speed	end_speed	plate_time	x0	y0	z0	vx0	vy0	vz0	ax	ay	az	pfx_x	pfx_z	spin_rate	spin_axis	break_x	break_z	break_length	break_angle	event_type	event_description	plate_umpire
play_id
fd40dc91-e458-47d4-81d4-bdd654781440	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	23	1	bottom	0	False	False	False	False	Maikel Garcia	R	Michael King	R	1	0	0	C	Called Strike	-0.292692	2.207051	3.531020	1.612927	7.0	SI	Sinker	92.0	85.2	0.407594	-2.461166	50.003600	5.557445	8.973728	-133.710840	-4.062951	-17.086921	25.264212	-25.635070	-9.312735	3.563786	2491.0	200.0	15.2	-26.2	7.2	30.0	field_out	Lineout	Clint Vondrak
6c79e8ff-4972-4ad1-85cb-f994727c1f90	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	26	1	bottom	1	False	False	False	False	Bobby Witt Jr.	R	Michael King	R	1	0	0	C	Called Strike	0.311409	1.618673	3.403536	1.631027	9.0	SI	Sinker	93.5	87.0	0.400484	-2.256811	50.004739	5.486336	10.189194	-135.885978	-6.196413	-17.570281	25.019013	-22.974243	-9.236481	4.838141	1859.0	165.0	15.0	-23.2	6.0	33.6	field_out	Groundout	Clint Vondrak
e3f2d82a-9778-43aa-a813-05f91eb2122e	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	27	1	bottom	1	False	False	False	False	Bobby Witt Jr.	R	Michael King	R	2	0	1	B	Ball	-1.375257	2.676168	3.491020	1.660077	11.0	SI	Sinker	94.0	86.8	0.398597	-2.402972	50.001801	5.610981	5.599407	-136.873132	-4.246522	-15.254782	27.280516	-20.179332	-7.957966	6.256626	2321.0	215.0	13.4	-19.9	4.8	33.6	field_out	Groundout	Clint Vondrak
38494e8c-a6db-48ae-93b0-08f09e346853	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	29	1	bottom	1	False	False	False	False	Bobby Witt Jr.	R	Michael King	R	4	1	2	B	Ball	1.779492	1.317006	3.491020	1.718184	14.0	ST	Sweeper	81.9	76.2	0.458978	-2.404842	50.003641	5.380935	7.232091	-119.132844	-2.905202	12.379237	21.944713	-31.434472	8.574210	0.514236	2896.0	60.0	-17.3	-40.2	9.6	22.8	field_out	Groundout	Clint Vondrak
2334d79b-c2c7-4421-89b2-10cf4724d1d0	716352	2023-10-01	day	New York Yankees	NYY	Kansas City Royals	KC	31	1	bottom	2	False	False	False	False	Salvador Perez	R	Michael King	R	1	0	0	C	Called Strike	0.448960	3.073323	3.540189	1.649124	3.0	SI	Sinker	93.9	87.2	0.398503	-2.280361	50.006529	5.488510	10.592485	-136.501232	-2.116903	-17.286584	24.924189	-24.112642	-8.994766	4.199174	2288.0	225.0	14.5	-23.3	6.0	32.4	strikeout	Strikeout	Clint Vondrak

	inning	ab_number	cap_index	batter	stand	batter_name	pitcher	p_throws	pitcher_name	team_batting	team_fielding	team_batting_id	team_fielding_id	result	des	events	contextMetrics	strikes	balls	pre_strikes	pre_balls	call	call_name	pitch_type	pitch_name	description	result_code	pitch_call	is_strike_swinging	balls_and_strikes	start_speed	end_speed	sz_top	sz_bot	extension	plateTime	zone	spin_rate	px	pz	x0	y0	z0	ax	ay	az	vx0	vy0	vz0	pfxX	pfxZ	pfxZWithGravity	pfxZWithGravityNice	pfxZDirection	pfxXWithGravity	pfxXNoAbs	pfxXDirection	breakX	breakZ	inducedBreakZ	hit_speed_round	hit_speed	hit_distance	xba	hit_angle	is_barrel	hc_x	hc_x_ft	hc_y	hc_y_ft	is_bip_out	pitch_number	player_total_pitches	player_total_pitches_pitch_types	game_total_pitches	rowId	game_pk	player_name	runnerOn1B	runnerOn2B	runnerOn3B	half_inning	away_team	away_abbrev	home_team	home_abbrev	date	day_night	plate_umpire	outs	runner_going	spinDirection	event_type	event_description	breakVertical	breakHorizontal	breakLength	breakAngle
play_id
4a069011-5958-4fc4-af7c-1a8e37395891	1	4	0	606466	R	Ketel Marte	650911	L	Cristopher Sánchez	AZ	PHI	109	143	Groundout	Ketel Marte grounds out, second baseman Bryson...	Groundout	{'homeRunBallparks': 0}	0	0	0	0	X	In Play	SI	Sinker	In play, out(s)	X	hit_into_play	False	00	92.2	84.4	3.540000	1.690000	6.791236	0.408719	9	1970	0.807688	2.162640	1.748492	50.002941	5.834315	19.856549	29.578406	-26.116662	-6.241417	-134.122073	-4.777216	10.922046	3.330782	-26.611854	27	↓	18	18	←	18.0	27.0	6.0	100.0	99.6	6	.110	-22	0.0	138.08	31.045509	124.83	189.381761	Y	1	1	1	23	23-748544	748544	PHI	NaN	NaN	NaN	bottom	Philadelphia Phillies	PHI	Arizona Diamondbacks	AZ	2023-10-20	night	Mike Muchlinski	0	False	117.0	field_out	Groundout	-27.1	-18.4	7.2	34.8
1c820466-a1e0-4c76-a43f-798929e1a444	1	5	0	682998	L	Corbin Carroll	650911	L	Cristopher Sánchez	AZ	PHI	109	143	Groundout	Corbin Carroll grounds out, pitcher Cristopher...	Groundout	{}	0	0	0	0	S	Strike	SI	Sinker	Called Strike	C	called_strike	False	00	92.9	85.1	3.170675	1.576307	7.080770	0.405783	4	2015	-0.554020	2.560673	1.719392	50.004482	5.804752	19.404949	28.987894	-24.830040	-9.698387	-134.912303	-3.986018	10.511402	3.978887	-25.055092	25	↓	18	18	←	17.0	25.0	7.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Y	1	2	2	24	24-748544	748544	NaN	NaN	NaN	NaN	bottom	Philadelphia Phillies	PHI	Arizona Diamondbacks	AZ	2023-10-20	night	Mike Muchlinski	1	False	115.0	field_out	Groundout	-25.2	-17.0	7.2	33.6
c1a0cdf7-d190-4be0-964c-bd4de540240e	1	5	0	682998	L	Corbin Carroll	650911	L	Cristopher Sánchez	AZ	PHI	109	143	Groundout	Corbin Carroll grounds out, pitcher Cristopher...	Groundout	{}	1	0	1	0	S	Strike	SI	Sinker	Foul	F	foul	False	01	92.8	85.1	3.140000	1.480000	7.046580	0.406061	5	1934	-0.090677	2.041874	1.807166	50.001698	5.771773	18.373196	29.081882	-25.919565	-8.503464	-134.844971	-5.066663	9.968781	3.392270	-26.089912	26	↓	17	17	←	16.0	26.0	6.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Y	2	3	3	25	25-748544	748544	NaN	NaN	NaN	NaN	bottom	Philadelphia Phillies	PHI	Arizona Diamondbacks	AZ	2023-10-20	night	Mike Muchlinski	1	False	111.0	field_out	Groundout	-26.6	-16.2	7.2	31.2
cd48ca82-50a3-4603-a9fa-8a8cbce8944e	1	5	0	682998	L	Corbin Carroll	650911	L	Cristopher Sánchez	AZ	PHI	109	143	Groundout	Corbin Carroll grounds out, pitcher Cristopher...	Groundout	{}	2	0	2	0	B	Ball	CH	Changeup	Ball	B	ball	False	02	82.9	76.4	3.170675	1.477469	7.000006	0.455212	14	1953	0.113231	0.314335	1.933014	50.001007	5.593012	13.870536	23.808931	-31.199647	-7.242118	-120.413762	-5.970624	9.464740	0.660322	-38.877095	39	↓	16	16	←	15.0	39.0	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Y	3	4	1	26	26-748544	748544	NaN	NaN	NaN	NaN	bottom	Philadelphia Phillies	PHI	Arizona Diamondbacks	AZ	2023-10-20	night	Mike Muchlinski	1	False	98.0	field_out	Groundout	-40.1	-15.4	9.6	21.6
3c3f127a-a0e8-46bb-903e-9b8a0afbbca6	1	5	0	682998	L	Corbin Carroll	650911	L	Cristopher Sánchez	AZ	PHI	109	143	Groundout	Corbin Carroll grounds out, pitcher Cristopher...	Groundout	{'homeRunBallparks': 0}	2	1	2	1	X	In Play	CH	Changeup	In play, out(s)	X	hit_into_play	False	12	83.2	76.4	3.140000	1.480000	6.907456	0.452515	12	1933	1.045599	2.659124	1.871132	50.003868	5.889874	14.959764	23.784962	-33.967735	-5.102493	-121.073603	-0.612647	10.082693	-1.208386	-41.562701	42	↓	17	17	←	17.0	41.0	-1.0	61.0	60.8	19	.070	-7	0.0	125.30	0.241108	180.76	54.570326	Y	4	5	2	27	27-748544	748544	NaN	NaN	NaN	NaN	bottom	Philadelphia Phillies	PHI	Arizona Diamondbacks	AZ	2023-10-20	night	Mike Muchlinski	1	False	97.0	field_out	Groundout	-41.6	-17.0	9.6	22.8

An analysis of MLB umpire strike zones in 2023

The human definition of the strike zone

The computer definition of the strike zone

Data collection

Finding the list of 2023 games

Getting pitch data from all 2023 games

Data cleaning

Reorganize the columns

Only including taken pitches

Remove missing data

Exploratory data analysis

Overall umpire performance in 2023

Accuracy by pitch speed

Accuracy by pitch type

Heatmaps of called strikes

Umpires less likely to strike out batters

Analysis, hypothesis testing, & ML

Insight and policy decisions

	called	num_inc	accuracy	calls_above_avg
umpire_name
Lance Barksdale	129	2	0.984496	6
Todd Tichenor	134	4	0.970149	0
Bill Miller	158	7	0.955696	5
Brian O'Nora	133	6	0.954887	3
Brian Knight	296	14	0.952703	15

	called	num_inc	accuracy	calls_above_avg
umpire_name
Doug Eddings	280	31	0.889286	-10
Marvin Hudson	145	16	0.889655	-2
Jansen Visconti	131	14	0.893130	-3
Dan Iassogna	137	14	0.897810	-11
David Rackley	292	27	0.907534	-13