
/examples/inspection/plot_partial_dependence.py

https://github.com/NelleV/scikit-learn
Possible License(s): BSD-3-Clause
  1. """
  2. ===============================================================
  3. Partial Dependence and Individual Conditional Expectation Plots
  4. ===============================================================
  5. Partial dependence plots show the dependence between the target function [2]_
  6. and a set of features of interest, marginalizing over the values of all other
  7. features (the complement features). Due to the limits of human perception, the
  8. size of the set of features of interest must be small (usually, one or two)
  9. thus they are usually chosen among the most important features.
  10. Similarly, an individual conditional expectation (ICE) plot [3]_
  11. shows the dependence between the target function and a feature of interest.
  12. However, unlike partial dependence plots, which show the average effect of the
  13. features of interest, ICE plots visualize the dependence of the prediction on a
  14. feature for each :term:`sample` separately, with one line per sample.
  15. Only one feature of interest is supported for ICE plots.
  16. This example shows how to obtain partial dependence and ICE plots from a
  17. :class:`~sklearn.neural_network.MLPRegressor` and a
  18. :class:`~sklearn.ensemble.HistGradientBoostingRegressor` trained on the
  19. California housing dataset. The example is taken from [1]_.
  20. .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
  21. Learning Ed. 2", Springer, 2009.
  22. .. [2] For classification you can think of it as the regression score before
  23. the link function.
  24. .. [3] Goldstein, A., Kapelner, A., Bleich, J., and Pitkin, E., Peeking Inside
  25. the Black Box: Visualizing Statistical Learning With Plots of
  26. Individual Conditional Expectation. (2015) Journal of Computational and
  27. Graphical Statistics, 24(1): 44-65 (https://arxiv.org/abs/1309.6392)
  28. """
print(__doc__)

# %%
# California Housing data preprocessing
# -------------------------------------
#
# Center target to avoid gradient boosting init bias: gradient boosting
# with the 'recursion' method does not account for the initial estimator
# (here the average target, by default).
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

cal_housing = fetch_california_housing()
X = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
y = cal_housing.target

y -= y.mean()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# %%
# 1-way partial dependence with different models
# ----------------------------------------------
#
# In this section, we will compute 1-way partial dependence with two different
# machine-learning models: (i) a multi-layer perceptron and (ii) a
# gradient-boosting model. With these two models, we illustrate how to compute
# and interpret both the partial dependence plot (PDP) and the individual
# conditional expectation (ICE) curves.
#
# Multi-layer perceptron
# ......................
#
# Let's fit a :class:`~sklearn.neural_network.MLPRegressor` and compute
# single-variable partial dependence plots.
from time import time

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.neural_network import MLPRegressor

print("Training MLPRegressor...")
tic = time()
est = make_pipeline(
    QuantileTransformer(),
    MLPRegressor(
        hidden_layer_sizes=(50, 50), learning_rate_init=0.01, early_stopping=True
    ),
)
est.fit(X_train, y_train)
print(f"done in {time() - tic:.3f}s")
print(f"Test R2 score: {est.score(X_test, y_test):.2f}")

# %%
# We configured a pipeline to scale the numerical input features and tuned the
# neural network size and learning rate to get a reasonable compromise between
# training time and predictive performance on a test set.
#
# Importantly, this tabular dataset has very different dynamic ranges for its
# features. Neural networks tend to be very sensitive to features with varying
# scales, and forgetting to preprocess the numeric features would lead to a
# very poor model (a quick look at the raw feature ranges follows below).
#
# It would be possible to get even higher predictive performance with a larger
# neural network but the training would also be significantly more expensive.
#
# Note that it is important to check that the model is accurate enough on a
# test set before plotting the partial dependence since there would be little
# use in explaining the impact of a given feature on the prediction function of
# a poor model.
#
# We will plot both the individual (ICE) and the averaged (PDP) partial
# dependence. We limit the plot to 50 ICE curves so as not to overcrowd it.
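#
# As a quick, illustrative check (not part of the original analysis), the raw
# feature ranges before the :class:`~sklearn.preprocessing.QuantileTransformer`
# is applied show how widely the scales differ:
print(X_train.agg(["min", "max"]).T)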
import matplotlib.pyplot as plt

from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence

print("Computing partial dependence plots...")
tic = time()
features = ["MedInc", "AveOccup", "HouseAge", "AveRooms"]
display = plot_partial_dependence(
    est,
    X_train,
    features,
    kind="both",
    subsample=50,
    n_jobs=3,
    grid_resolution=20,
    random_state=0,
    ice_lines_kw={"color": "tab:blue", "alpha": 0.2, "linewidth": 0.5},
    pd_line_kw={"color": "tab:orange", "linestyle": "--"},
)
print(f"done in {time() - tic:.3f}s")
display.figure_.suptitle(
    "Partial dependence of house value on non-location features\n"
    "for the California housing dataset, with MLPRegressor"
)
display.figure_.subplots_adjust(hspace=0.3)

# %%
# Gradient boosting
# .................
#
# Let's now fit a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` and
# compute the partial dependence on the same features.
from sklearn.ensemble import HistGradientBoostingRegressor

print("Training HistGradientBoostingRegressor...")
tic = time()
est = HistGradientBoostingRegressor()
est.fit(X_train, y_train)
print(f"done in {time() - tic:.3f}s")
print(f"Test R2 score: {est.score(X_test, y_test):.2f}")

# %%
# Here, we used the default hyperparameters for the gradient boosting model
# without any preprocessing, as tree-based models are naturally robust to
# monotonic transformations of numerical features.
#
# Note that on this tabular dataset, Gradient Boosting Machines are both
# significantly faster to train and more accurate than neural networks. It is
# also significantly cheaper to tune their hyperparameters (the defaults tend
# to work well, while this is often not the case for neural networks).
#
# We will plot both the individual (ICE) and the averaged (PDP) partial
# dependence. We limit the plot to 50 ICE curves so as not to overcrowd it.
print("Computing partial dependence plots...")
tic = time()
display = plot_partial_dependence(
    est,
    X_train,
    features,
    kind="both",
    subsample=50,
    n_jobs=3,
    grid_resolution=20,
    random_state=0,
    ice_lines_kw={"color": "tab:blue", "alpha": 0.2, "linewidth": 0.5},
    pd_line_kw={"color": "tab:orange", "linestyle": "--"},
)
print(f"done in {time() - tic:.3f}s")
display.figure_.suptitle(
    "Partial dependence of house value on non-location features\n"
    "for the California housing dataset, with Gradient Boosting"
)
display.figure_.subplots_adjust(wspace=0.4, hspace=0.3)

# %%
# Analysis of the plots
# .....................
#
# We can clearly see on the PDPs (dashed orange line) that the median house
# price shows a linear relationship with the median income (top left) and that
# the house price drops when the average occupants per household increases
# (top middle). The top right plot shows that the house age in a district does
# not have a strong influence on the (median) house price; neither does the
# average number of rooms per household.
#
# The ICE curves (light blue lines) complement the analysis: we can see that
# there are some exceptions, where the house price remains constant with
# respect to the median income and average occupancy. On the other hand, while
# the house age (top right) does not have a strong influence on the median
# house price on average, there seem to be a number of exceptions where the
# house price increases for house ages between 15 and 25. Similar exceptions
# can be observed for the average number of rooms (bottom left). Therefore,
# ICE plots show some individual effects which are attenuated by taking the
# average.
#
# In all plots, the tick marks on the x-axis represent the deciles of the
# feature values in the training data.
#
# We also observe that :class:`~sklearn.neural_network.MLPRegressor` has much
# smoother predictions than
# :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.
#
# However, it is worth noting that we are creating potentially meaningless
# synthetic samples if features are correlated.
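#
# As a quick, illustrative check of that caveat (not part of the original
# analysis), one can inspect the pairwise correlations of the features of
# interest on the training data, for instance with
# :meth:`pandas.DataFrame.corr`:
print(X_train[features].corr().round(2))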

# %%
# 2D interaction plots
# --------------------
#
# PDPs with two features of interest enable us to visualize interactions
# between them. However, ICE curves cannot be plotted in an easy manner for two
# features, and are thus harder to interpret. Another consideration is the cost
# of computing the PDPs: with tree-based models, when only PDPs are requested,
# they can be computed in an efficient way using the `'recursion'` method.
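#
# As a rough illustration (the exact timings will vary with the machine), the
# two computation strategies can be compared directly with
# :func:`~sklearn.inspection.partial_dependence`:
for method in ("brute", "recursion"):
    tic = time()
    partial_dependence(
        est,
        X_train,
        features=["AveOccup"],
        kind="average",
        method=method,
        grid_resolution=20,
    )
    print(f"method={method!r} done in {time() - tic:.3f}s")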
features = ["AveOccup", "HouseAge", ("AveOccup", "HouseAge")]
print("Computing partial dependence plots...")
tic = time()
_, ax = plt.subplots(ncols=3, figsize=(9, 4))
display = plot_partial_dependence(
    est,
    X_train,
    features,
    kind="average",
    n_jobs=3,
    grid_resolution=20,
    ax=ax,
)
print(f"done in {time() - tic:.3f}s")
display.figure_.suptitle(
    "Partial dependence of house value on non-location features\n"
    "for the California housing dataset, with Gradient Boosting"
)
display.figure_.subplots_adjust(wspace=0.4, hspace=0.3)

# %%
# The two-way partial dependence plot shows the dependence of median house
# price on joint values of house age and average occupants per household. We
# can clearly see an interaction between the two features: for an average
# occupancy greater than two, the house price is nearly independent of the
# house age, whereas for values less than two there is a strong dependence on
# age.
#
# 3D interaction plots
# --------------------
#
# Let's make the same partial dependence plot for the two-feature interaction,
# this time in 3 dimensions.
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()

features = ("AveOccup", "HouseAge")
pdp = partial_dependence(
    est, X_train, features=features, kind="average", grid_resolution=20
)
XX, YY = np.meshgrid(pdp["values"][0], pdp["values"][1])
Z = pdp.average[0].T
ax = Axes3D(fig)
fig.add_axes(ax)

surf = ax.plot_surface(
    XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu, edgecolor="k"
)
ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel("Partial dependence")
# pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.suptitle(
    "Partial dependence of house value on median\n"
    "age and average occupancy, with Gradient Boosting"
)
plt.subplots_adjust(top=0.9)
plt.show()