High-level Scientific Computing: SciPy

Bikash Santra

Research Scholar, Indian Statistical Institute, Kolkata

SciPyÂ¶

Scipy is composed of task-specific sub-modules:

scipy.io - Data input and output
scipy.linalg - Linear algebra routines
scipy.interpolate - Interpolation
scipy.optimize - Optimization
scipy.stats - Statistics

scipy.cluster - Vector quantization / Kmeans
scipy.constants - Physical and mathematical constants
scipy.fftpack - Fourier transform
scipy.integrate - Integration routines
scipy.ndimage - n-dimensional image package
scipy.odr - Orthogonal distance regression
scipy.signal - Signal processing
scipy.sparse - Sparse matrices
scipy.spatial - Spatial data structures and algorithms
scipy.special - Any special mathematical functions

They all depend on numpy, but are mostly independent of each other. The standard way of importing Numpy and these Scipy modules is:
import numpy as np
from scipy import stats

File input/output: scipy.ioÂ¶

import numpy as np 
from scipy import io as spio
import matplotlib.pyplot as plt

# Saving as matlab file
a = np.ones((3, 3))
spio.savemat('file.mat', {'a': a}) # savemat expects a dictionary

# Loading matlab file
data = spio.loadmat('file.mat')
data['a']

# Python / Matlab mismatches, eg matlab does not represent 1D arrays
a = np.ones(3)
print(a)
spio.savemat('file.mat', {'a': a})
data = spio.loadmat('file.mat')['a']
print(data)

Linear algebra operations: scipy.linalgÂ¶

from scipy import linalg

Determinant of matricesÂ¶

arr = np.array([[1, 2],
                [3, 4]])

linalg.det(arr)

arr = np.array([[3, 2],
                [6, 4]])

linalg.det(arr)

linalg.det(np.ones((3, 4)))

linalg.det(np.ones((3, 3)))

Matrix inverseÂ¶

arr = np.array([[1, 2],
                [3, 4]])

iarr = linalg.inv(arr)
print(iarr)

arr = np.array([[3, 2],
                [6, 4]])

linalg.inv(arr)

Singular-value decomposition (SVD)Â¶

arr = np.arange(12).reshape((3, 4)) + 1
print(arr)

U, s, V = linalg.svd(arr,full_matrices=False)
print(U.shape, s.shape, V.shape)

print(U)

print(s)

print(V)

# The original matrix can be re-composed by matrix multiplication of the outputs of svd with np.dot:
sarr = np.diag(s)
svd_mat = U.dot(sarr).dot(V)
np.allclose(svd_mat, arr) # Elementwise equal or not?

Interpolation: scipy.interpolateÂ¶

from scipy.interpolate import interp1d

# Generate data
np.random.seed(0)
measured_time = np.linspace(0, 1, 10)
noise = 1e-1 * (np.random.random(10)*2 - 1)
measures = np.sin(2 * np.pi * measured_time) + noise

# Interpolate it to new time points
linear_interp = interp1d(measured_time, measures)
interpolation_time = np.linspace(0, 1, 50)
linear_results = linear_interp(interpolation_time)
cubic_interp = interp1d(measured_time, measures, kind='cubic')
cubic_results = cubic_interp(interpolation_time)

# Plot the data and the interpolation
from matplotlib import pyplot as plt
plt.figure(figsize=(6, 4))
plt.plot(measured_time, measures, 'o', ms=6, label='measures')
plt.plot(interpolation_time, linear_results, label='linear interp')
plt.plot(interpolation_time, cubic_results, label='cubic interp')
plt.legend()
plt.show()

Optimization and fit: scipy.optimizeÂ¶

Curve fittingÂ¶

#Demos a simple curve fitting

#First generate some data

# Seed the random number generator for reproducibility
np.random.seed(0)

x_data = np.linspace(-5, 5, num=50)
y_data = 2.9 * np.sin(1.5 * x_data) + np.random.normal(size=50)

# And plot it
plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data)
plt.show()

# Now fit a simple sine function to the data

from scipy import optimize

def test_func(x, a, b):
    return a * np.sin(b * x)

params, params_covariance = optimize.curve_fit(test_func, x_data, y_data,
                                               p0=[2, 2])

print(params)

# And plot the resulting curve on the data
plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data, label='Data')
plt.plot(x_data, test_func(x_data, params[0], params[1]),
         label='Fitted function')

plt.legend(loc='best')

plt.show()

Finding the minimum of a smooth functionÂ¶

# Demos various methods to find the minimum of a function.
def f(x):
    return x**2 + 10*np.sin(x)


x = np.arange(-10, 10, 0.1)
plt.plot(x, f(x))
plt.show()

# Now find the minimum with a few methods
from scipy import optimize

# The default (Nelder Mead)
print(optimize.minimize(f, x0=0))

# Methods: As the function is a smooth function, gradient-descent based methods are good options. 
# The lBFGS algorithm is a good choice in general:
res = optimize.minimize(f, x0=0, method="L-BFGS-B")
print(res)

xMin = res.x

plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()

# Global minimum: A possible issue with this approach is that, 
# if the function has local minima, the algorithm may find these 
# local minima instead of the global minimum depending on the initial point x0:
res = optimize.minimize(f, x0=3, method="L-BFGS-B")
xMin = res.x

plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()

# If we don’t know the neighborhood of the global minimum to choose
# the initial point, we need to resort to costlier global optimization.
# To find the global minimum, we use scipy.optimize.basinhopping()
# (added in version 0.12.0 of Scipy). 
# t combines a local optimizer with sampling of starting points:

res = optimize.basinhopping(f, 0)

xMin = res.x

plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()

# We can constrain the variable to the interval (0, 10) using the “bounds” argument:
res = optimize.minimize(f, x0=3, bounds=((0, 10), ))
xMin = res.x

plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()

Finding the roots of a scalar functionÂ¶

def f(x):
    return x**2 + 10*np.sin(x)


x = np.arange(-10, 10, 0.1)
plt.plot(x, f(x))
plt.show()

# Root finding
root = optimize.root(f, 1)  # our initial guess is 1
print("First root found %s" % root.x)
root2 = optimize.root(f, -2.5)
print("Second root found %s" % root2.x)

# Find minima
# Global optimization
grid = (-10, 10, 0.1)
xmin_global = optimize.brute(f, (grid, ))
print("Global minima found %s" % xmin_global)

# Constrain optimization
xmin_local = optimize.fminbound(f, 0, 10)
print("Local minimum found %s" % xmin_local)

# Plot function, minima, and roots
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(111)

# Plot the function
ax.plot(x, f(x), 'b-', label="f(x)")

# Plot the minima
xmins = np.array([xmin_global[0], xmin_local])
ax.plot(xmins, f(xmins), 'go', label="Minima")

# Plot the roots
roots = np.array([root.x, root2.x])
ax.plot(roots, f(roots), 'kv', label="Roots")

# Decorate the figure
ax.legend(loc='best')
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.axhline(0, color='gray')
plt.show()

Statistics and random numbers: scipy.statsÂ¶

Statistical Analysis using PythonÂ¶

1) pandas, statsmodels, seaborn: https://www.scipy-lectures.org/packages/statistics/index.html#statistics
2) pyMC (for Baysian statistics): http://pymc-devs.github.io/pymc/tutorial.html

Distributions: histogram and probability density functionÂ¶

# Sample from a normal distribution using numpy's random number generator
samples = np.random.normal(size=10000)

# Compute a histogram of the sample
bins = np.linspace(-5, 5, 30)
histogram, bins = np.histogram(samples, bins=bins, normed=True)

bin_centers = 0.5*(bins[1:] + bins[:-1])

# Compute the PDF on the bin centers from scipy distribution object
from scipy import stats
pdf = stats.norm.pdf(bin_centers)

plt.figure(figsize=(6, 4))
plt.plot(bin_centers, histogram, label="Histogram of samples")
plt.plot(bin_centers, pdf, label="PDF")
plt.legend()
plt.show()

mean, std = stats.norm.fit(samples)
print(mean,std)

Mean, median and percentilesÂ¶

np.mean(samples)

np.median(samples)

# The median is also the percentile 50, because 50% of the observation are below it:
stats.scoreatpercentile(samples, 50)

# Similarly, we can calculate the percentile 90:
stats.scoreatpercentile(samples, 90)

Comparing 2 sets of samples from GaussiansÂ¶

# Generates 2 sets of observations
samples1 = np.random.normal(0, size=1000)
samples2 = np.random.normal(1, size=1000)

# Compute a histogram of the sample
bins = np.linspace(-4, 4, 30)
histogram1, bins = np.histogram(samples1, bins=bins, normed=True)
histogram2, bins = np.histogram(samples2, bins=bins, normed=True)

plt.figure(figsize=(6, 4))
plt.hist(samples1, bins=bins, normed=True, label="Samples 1")
plt.hist(samples2, bins=bins, normed=True, label="Samples 2")
plt.legend(loc='best')
plt.show()

Linear RegressionÂ¶

from scipy import stats

np.random.seed(12345678)
x = np.random.random(10)
y = np.random.random(10)

slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print("r-squared:", r_value**2)

plt.plot(x, y, 'o', label='original data')
plt.plot(x, intercept + slope*x, 'r', label='fitted line')
plt.legend()
plt.show()

ReferencesÂ¶

a) https://docs.scipy.org/doc/scipy/reference/tutorial/index.html
b) https://github.com/kuleshov/cs228-material/blob/master/tutorials/python/cs228-python-tutorial.ipynb
c) https://www.scipy-lectures.org/intro/scipy.html#scipy
d) https://www.scipy-lectures.org/

Click here to download the source code of this tutorial.