Bikash Santra
Indian Statistical Institute, Kolkata
Scipy is composed of task-specific sub-modules:
scipy.io - Data input and output
scipy.linalg - Linear algebra routines
scipy.interpolate - Interpolation
scipy.optimize - Optimization
scipy.stats - Statistics
scipy.cluster - Vector quantization / Kmeans
scipy.constants - Physical and mathematical constants
scipy.fftpack - Fourier transform
scipy.integrate - Integration routines
scipy.ndimage - n-dimensional image package
scipy.odr - Orthogonal distance regression
scipy.signal - Signal processing
scipy.sparse - Sparse matrices
scipy.spatial - Spatial data structures and algorithms
scipy.special - Any special mathematical functions
They all depend on numpy, but are mostly independent of each other. The standard way of importing Numpy and these Scipy modules is:
import numpy as np
from scipy import stats
import numpy as np
from scipy import io as spio
# Saving as matlab file
a = np.ones((3, 3))
spio.savemat('file.mat', {'a': a}) # savemat expects a dictionary
# Loading matlab file
data = spio.loadmat('file.mat')
data['a']
array([[1., 1., 1.], [1., 1., 1.], [1., 1., 1.]])
# Python / Matlab mismatches, eg matlab does not represent 1D arrays
a = np.ones(3)
print(a)
spio.savemat('file.mat', {'a': a})
data = spio.loadmat('file.mat')['a']
print(data)
[1. 1. 1.] [[1. 1. 1.]]
from scipy import linalg
arr = np.array([[1, 2],
[3, 4]])
linalg.det(arr)
-2.0
arr = np.array([[3, 2],
[6, 4]])
linalg.det(arr)
6.661338147750939e-16
linalg.det(np.ones((3, 4)))
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-11-06573727bd3f> in <module> ----> 1 linalg.det(np.ones((3, 4))) C:\ProgramData\Anaconda3\lib\site-packages\scipy\linalg\basic.py in det(a, overwrite_a, check_finite) 1018 a1 = _asarray_validated(a, check_finite=check_finite) 1019 if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]: -> 1020 raise ValueError('expected square matrix') 1021 overwrite_a = overwrite_a or _datacopied(a1, a) 1022 fdet, = get_flinalg_funcs(('det',), (a1,)) ValueError: expected square matrix
linalg.det(np.ones((3, 3)))
0.0
arr = np.array([[1, 2],
[3, 4]])
iarr = linalg.inv(arr)
print(iarr)
[[-2. 1. ] [ 1.5 -0.5]]
arr = np.array([[3, 2],
[6, 4]])
linalg.inv(arr)
--------------------------------------------------------------------------- LinAlgError Traceback (most recent call last) <ipython-input-17-ed8389f90f54> in <module> ----> 1 linalg.inv(arr) C:\ProgramData\Anaconda3\lib\site-packages\scipy\linalg\basic.py in inv(a, overwrite_a, check_finite) 961 inv_a, info = getri(lu, piv, lwork=lwork, overwrite_lu=1) 962 if info > 0: --> 963 raise LinAlgError("singular matrix") 964 if info < 0: 965 raise ValueError('illegal value in %d-th argument of internal ' LinAlgError: singular matrix
arr = np.arange(9).reshape((3, 3)) + np.diag([1, 0, 1])
print(arr)
[[1 1 2] [3 4 5] [6 7 9]]
uarr, spec, vharr = linalg.svd(arr)
print(uarr)
[[-0.1617463 -0.98659196 0.02178164] [-0.47456365 0.09711667 0.87484724] [-0.86523261 0.13116653 -0.48390895]]
print(spec)
[14.88982544 0.45294236 0.29654967]
print(vharr)
[[-0.45513179 -0.54511245 -0.70406496] [ 0.20258033 0.70658087 -0.67801525] [-0.86707339 0.45121601 0.21115836]]
# The original matrix can be re-composed by matrix multiplication of the outputs of svd with np.dot:
sarr = np.diag(spec)
svd_mat = uarr.dot(sarr).dot(vharr)
np.allclose(svd_mat, arr) # Elementwise equal or not?
True
from scipy.interpolate import interp1d
# Generate data
np.random.seed(0)
measured_time = np.linspace(0, 1, 10)
noise = 1e-1 * (np.random.random(10)*2 - 1)
measures = np.sin(2 * np.pi * measured_time) + noise
# Interpolate it to new time points
linear_interp = interp1d(measured_time, measures)
interpolation_time = np.linspace(0, 1, 50)
linear_results = linear_interp(interpolation_time)
cubic_interp = interp1d(measured_time, measures, kind='cubic')
cubic_results = cubic_interp(interpolation_time)
# Plot the data and the interpolation
from matplotlib import pyplot as plt
plt.figure(figsize=(6, 4))
plt.plot(measured_time, measures, 'o', ms=6, label='measures')
plt.plot(interpolation_time, linear_results, label='linear interp')
plt.plot(interpolation_time, cubic_results, label='cubic interp')
plt.legend()
plt.show()
#Demos a simple curve fitting
#First generate some data
# Seed the random number generator for reproducibility
np.random.seed(0)
x_data = np.linspace(-5, 5, num=50)
y_data = 2.9 * np.sin(1.5 * x_data) + np.random.normal(size=50)
# And plot it
plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data)
plt.show()
# Now fit a simple sine function to the data
from scipy import optimize
def test_func(x, a, b):
return a * np.sin(b * x)
params, params_covariance = optimize.curve_fit(test_func, x_data, y_data,
p0=[2, 2])
print(params)
[3.05931973 1.45754553]
# And plot the resulting curve on the data
plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data, label='Data')
plt.plot(x_data, test_func(x_data, params[0], params[1]),
label='Fitted function')
plt.legend(loc='best')
plt.show()
# Demos various methods to find the minimum of a function.
def f(x):
return x**2 + 10*np.sin(x)
x = np.arange(-10, 10, 0.1)
plt.plot(x, f(x))
plt.show()
# Now find the minimum with a few methods
from scipy import optimize
# The default (Nelder Mead)
print(optimize.minimize(f, x0=0))
fun: -7.945823375615215 hess_inv: array([[0.08589237]]) jac: array([-1.1920929e-06]) message: 'Optimization terminated successfully.' nfev: 12 nit: 5 njev: 6 status: 0 success: True x: array([-1.30644012])
# Methods: As the function is a smooth function, gradient-descent based methods are good options.
# The lBFGS algorithm is a good choice in general:
res = optimize.minimize(f, x0=0, method="L-BFGS-B")
print(res)
fun: array([-7.94582338]) hess_inv: <1x1 LbfgsInvHessProduct with dtype=float64> jac: array([-1.68753901e-06]) message: 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL' nfev: 12 nit: 5 njev: 6 status: 0 success: True x: array([-1.30644017])
xMin = res.x
plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()
# Global minimum: A possible issue with this approach is that,
# if the function has local minima, the algorithm may find these
# local minima instead of the global minimum depending on the initial point x0:
res = optimize.minimize(f, x0=3, method="L-BFGS-B")
xMin = res.x
plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()
# If we don’t know the neighborhood of the global minimum to choose
# the initial point, we need to resort to costlier global optimization.
# To find the global minimum, we use scipy.optimize.basinhopping()
# (added in version 0.12.0 of Scipy).
# t combines a local optimizer with sampling of starting points:
res = optimize.basinhopping(f, 0)
xMin = res.x
plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()
# We can constrain the variable to the interval (0, 10) using the “bounds” argument:
res = optimize.minimize(f, x0=3, bounds=((0, 10), ))
xMin = res.x
plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()
def f(x):
return x**2 + 10*np.sin(x)
x = np.arange(-10, 10, 0.1)
plt.plot(x, f(x))
plt.show()
# Root finding
root = optimize.root(f, 1) # our initial guess is 1
print("First root found %s" % root.x)
root2 = optimize.root(f, -2.5)
print("Second root found %s" % root2.x)
First root found [0.] Second root found [-2.47948183]
# Find minima
# Global optimization
grid = (-10, 10, 0.1)
xmin_global = optimize.brute(f, (grid, ))
print("Global minima found %s" % xmin_global)
# Constrain optimization
xmin_local = optimize.fminbound(f, 0, 10)
print("Local minimum found %s" % xmin_local)
Global minima found [-1.30641113] Local minimum found 3.8374671194983834
# Plot function, minima, and roots
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(111)
# Plot the function
ax.plot(x, f(x), 'b-', label="f(x)")
# Plot the minima
xmins = np.array([xmin_global[0], xmin_local])
ax.plot(xmins, f(xmins), 'go', label="Minima")
# Plot the roots
roots = np.array([root.x, root2.x])
ax.plot(roots, f(roots), 'kv', label="Roots")
# Decorate the figure
ax.legend(loc='best')
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.axhline(0, color='gray')
plt.show()
1) pandas, statsmodels, seaborn: https://www.scipy-lectures.org/packages/statistics/index.html#statistics
2) pyMC (for Baysian statistics): http://pymc-devs.github.io/pymc/tutorial.html
# Sample from a normal distribution using numpy's random number generator
samples = np.random.normal(size=10000)
# Compute a histogram of the sample
bins = np.linspace(-5, 5, 30)
histogram, bins = np.histogram(samples, bins=bins, normed=True)
bin_centers = 0.5*(bins[1:] + bins[:-1])
# Compute the PDF on the bin centers from scipy distribution object
from scipy import stats
pdf = stats.norm.pdf(bin_centers)
plt.figure(figsize=(6, 4))
plt.plot(bin_centers, histogram, label="Histogram of samples")
plt.plot(bin_centers, pdf, label="PDF")
plt.legend()
plt.show()
<ipython-input-42-77231af28dea>:6: VisibleDeprecationWarning: Passing `normed=True` on non-uniform bins has always been broken, and computes neither the probability density function nor the probability mass function. The result is only correct if the bins are uniform, when density=True will produce the same result anyway. The argument will be removed in a future version of numpy. histogram, bins = np.histogram(samples, bins=bins, normed=True)
mean, std = stats.norm.fit(samples)
print(mean,std)
-0.01824647525837795 0.9870225650686327
np.mean(samples)
-0.01824647525837795
np.median(samples)
-0.025087317353292057
# The median is also the percentile 50, because 50% of the observation are below it:
stats.scoreatpercentile(samples, 50)
-0.025087317353292057
# Similarly, we can calculate the percentile 90:
stats.scoreatpercentile(samples, 90)
1.2510702078093916
help(np.random.normal)
Help on built-in function normal: normal(...) method of numpy.random.mtrand.RandomState instance normal(loc=0.0, scale=1.0, size=None) Draw random samples from a normal (Gaussian) distribution. The probability density function of the normal distribution, first derived by De Moivre and 200 years later by both Gauss and Laplace independently [2]_, is often called the bell curve because of its characteristic shape (see the example below). The normal distributions occurs often in nature. For example, it describes the commonly occurring distribution of samples influenced by a large number of tiny, random disturbances, each with its own unique distribution [2]_. .. note:: New code should use the ``normal`` method of a ``default_rng()`` instance instead; please see the :ref:`random-quick-start`. Parameters ---------- loc : float or array_like of floats Mean ("centre") of the distribution. scale : float or array_like of floats Standard deviation (spread or "width") of the distribution. Must be non-negative. size : int or tuple of ints, optional Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``loc`` and ``scale`` are both scalars. Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. Returns ------- out : ndarray or scalar Drawn samples from the parameterized normal distribution. See Also -------- scipy.stats.norm : probability density function, distribution or cumulative density function, etc. Generator.normal: which should be used for new code. Notes ----- The probability density for the Gaussian distribution is .. math:: p(x) = \frac{1}{\sqrt{ 2 \pi \sigma^2 }} e^{ - \frac{ (x - \mu)^2 } {2 \sigma^2} }, where :math:`\mu` is the mean and :math:`\sigma` the standard deviation. The square of the standard deviation, :math:`\sigma^2`, is called the variance. The function has its peak at the mean, and its "spread" increases with the standard deviation (the function reaches 0.607 times its maximum at :math:`x + \sigma` and :math:`x - \sigma` [2]_). This implies that normal is more likely to return samples lying close to the mean, rather than those far away. References ---------- .. [1] Wikipedia, "Normal distribution", https://en.wikipedia.org/wiki/Normal_distribution .. [2] P. R. Peebles Jr., "Central Limit Theorem" in "Probability, Random Variables and Random Signal Principles", 4th ed., 2001, pp. 51, 51, 125. Examples -------- Draw samples from the distribution: >>> mu, sigma = 0, 0.1 # mean and standard deviation >>> s = np.random.normal(mu, sigma, 1000) Verify the mean and the variance: >>> abs(mu - np.mean(s)) 0.0 # may vary >>> abs(sigma - np.std(s, ddof=1)) 0.1 # may vary Display the histogram of the samples, along with the probability density function: >>> import matplotlib.pyplot as plt >>> count, bins, ignored = plt.hist(s, 30, density=True) >>> plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) * ... np.exp( - (bins - mu)**2 / (2 * sigma**2) ), ... linewidth=2, color='r') >>> plt.show() Two-by-four array of samples from N(3, 6.25): >>> np.random.normal(3, 2.5, size=(2, 4)) array([[-4.49401501, 4.00950034, -1.81814867, 7.29718677], # random [ 0.39924804, 4.68456316, 4.99394529, 4.84057254]]) # random
# Generates 2 sets of observations
samples1 = np.random.normal(0, size=1000)
samples2 = np.random.normal(1, size=1000)
# Compute a histogram of the sample
bins = np.linspace(-4, 4, 30)
histogram1, bins = np.histogram(samples1, bins=bins, normed=True)
histogram2, bins = np.histogram(samples2, bins=bins, normed=True)
plt.figure(figsize=(6, 4))
plt.hist(samples1, bins=bins, normed=True, label="Samples 1")
plt.hist(samples2, bins=bins, normed=True, label="Samples 2")
plt.legend(loc='best')
plt.show()
<ipython-input-49-ef98de6e23f7>:7: VisibleDeprecationWarning: Passing `normed=True` on non-uniform bins has always been broken, and computes neither the probability density function nor the probability mass function. The result is only correct if the bins are uniform, when density=True will produce the same result anyway. The argument will be removed in a future version of numpy. histogram1, bins = np.histogram(samples1, bins=bins, normed=True) <ipython-input-49-ef98de6e23f7>:8: VisibleDeprecationWarning: Passing `normed=True` on non-uniform bins has always been broken, and computes neither the probability density function nor the probability mass function. The result is only correct if the bins are uniform, when density=True will produce the same result anyway. The argument will be removed in a future version of numpy. histogram2, bins = np.histogram(samples2, bins=bins, normed=True)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-49-ef98de6e23f7> in <module> 9 10 plt.figure(figsize=(6, 4)) ---> 11 plt.hist(samples1, bins=bins, normed=True, label="Samples 1") 12 plt.hist(samples2, bins=bins, normed=True, label="Samples 2") 13 plt.legend(loc='best') C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py in hist(x, bins, range, density, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, data, **kwargs) 2683 orientation='vertical', rwidth=None, log=False, color=None, 2684 label=None, stacked=False, *, data=None, **kwargs): -> 2685 return gca().hist( 2686 x, bins=bins, range=range, density=density, weights=weights, 2687 cumulative=cumulative, bottom=bottom, histtype=histtype, C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs) 1445 def inner(ax, *args, data=None, **kwargs): 1446 if data is None: -> 1447 return func(ax, *map(sanitize_sequence, args), **kwargs) 1448 1449 bound = new_sig.bind(ax, *args, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in hist(self, x, bins, range, density, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs) 6813 if patch: 6814 p = patch[0] -> 6815 p.update(kwargs) 6816 if lbl is not None: 6817 p.set_label(lbl) C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\artist.py in update(self, props) 994 func = getattr(self, f"set_{k}", None) 995 if not callable(func): --> 996 raise AttributeError(f"{type(self).__name__!r} object " 997 f"has no property {k!r}") 998 ret.append(func(v)) AttributeError: 'Rectangle' object has no property 'normed'
a) https://docs.scipy.org/doc/scipy/reference/tutorial/index.html
b) https://github.com/kuleshov/cs228-material/blob/master/tutorials/python/cs228-python-tutorial.ipynb
c) https://www.scipy-lectures.org/intro/scipy.html#scipy
d) https://www.scipy-lectures.org/