Scipy is composed of task-specific sub-modules:
scipy.io - Data input and output
scipy.linalg - Linear algebra routines
scipy.interpolate - Interpolation
scipy.optimize - Optimization
scipy.stats - Statistics
scipy.cluster - Vector quantization / Kmeans
scipy.constants - Physical and mathematical constants
scipy.fftpack - Fourier transform
scipy.integrate - Integration routines
scipy.ndimage - n-dimensional image package
scipy.odr - Orthogonal distance regression
scipy.signal - Signal processing
scipy.sparse - Sparse matrices
scipy.spatial - Spatial data structures and algorithms
scipy.special - Any special mathematical functions
They all depend on numpy, but are mostly independent of each other. The standard way of importing Numpy and these Scipy modules is:
import numpy as np
from scipy import stats
import numpy as np
from scipy import io as spio
import matplotlib.pyplot as plt
# Saving as matlab file
a = np.ones((3, 3))
spio.savemat('file.mat', {'a': a}) # savemat expects a dictionary
# Loading matlab file
data = spio.loadmat('file.mat')
data['a']
# Python / Matlab mismatches, eg matlab does not represent 1D arrays
a = np.ones(3)
print(a)
spio.savemat('file.mat', {'a': a})
data = spio.loadmat('file.mat')['a']
print(data)
from numpy import genfromtxt
my_data = genfromtxt('my_file.csv', delimiter=',')
import pandas as pd
df=pd.read_csv('myfile.csv', sep=',',header=None)
df.values
import pandas as pd
xls = pd.ExcelFile("yourfilename.xls")
sheetX = xls.parse(2) #2 is the sheet number
var1 = sheetX['ColumnName']
print(var1[1]) #1 is the row number...
import numpy
a = numpy.asarray([ [1,2,3], [4,5,6], [7,8,9] ])
numpy.savetxt("foo.csv", a, delimiter=",")
import numpy as np
a = np.asarray([ [1,2,3], [4,5,6], [7,8,9] ])
a.tofile('foo.csv',sep=',',format='%10.5f')
import pandas as pd
df = pd.DataFrame(np_array)
df.to_csv("file_path.csv")
import csv
with open('example.csv', newline='') as File:
reader = csv.reader(File)
for row in reader:
print(row)
import csv
myData = [["first_name", "second_name", "Grade"],
['Alex', 'Brian', 'A'],
['Tom', 'Smith', 'B']]
myFile = open('example2.csv', 'w')
with myFile:
writer = csv.writer(myFile)
writer.writerows(myData)
print("Writing complete")
from scipy import linalg
arr = np.array([[1, 2],
[3, 4]])
linalg.det(arr)
arr = np.array([[3, 2],
[6, 4]])
linalg.det(arr)
linalg.det(np.ones((3, 4)))
linalg.det(np.ones((3, 3)))
arr = np.array([[1, 2],
[3, 4]])
iarr = linalg.inv(arr)
print(iarr)
arr = np.array([[3, 2],
[6, 4]])
linalg.inv(arr)
arr = np.arange(12).reshape((3, 4)) + 1
print(arr)
U, s, V = linalg.svd(arr,full_matrices=False)
print(U.shape, s.shape, V.shape)
print(U)
print(s)
print(V)
# The original matrix can be re-composed by matrix multiplication of the outputs of svd with np.dot:
sarr = np.diag(s)
svd_mat = U.dot(sarr).dot(V)
np.allclose(svd_mat, arr) # Elementwise equal or not?
from scipy.interpolate import interp1d
# Generate data
np.random.seed(0)
measured_time = np.linspace(0, 1, 10)
noise = 1e-1 * (np.random.random(10)*2 - 1)
measures = np.sin(2 * np.pi * measured_time) + noise
# Interpolate it to new time points
linear_interp = interp1d(measured_time, measures)
interpolation_time = np.linspace(0, 1, 50)
linear_results = linear_interp(interpolation_time)
cubic_interp = interp1d(measured_time, measures, kind='cubic')
cubic_results = cubic_interp(interpolation_time)
# Plot the data and the interpolation
from matplotlib import pyplot as plt
plt.figure(figsize=(6, 4))
plt.plot(measured_time, measures, 'o', ms=6, label='measures')
plt.plot(interpolation_time, linear_results, label='linear interp')
plt.plot(interpolation_time, cubic_results, label='cubic interp')
plt.legend()
plt.show()
#Demos a simple curve fitting
#First generate some data
# Seed the random number generator for reproducibility
np.random.seed(0)
x_data = np.linspace(-5, 5, num=50)
y_data = 2.9 * np.sin(1.5 * x_data) + np.random.normal(size=50)
# And plot it
plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data)
plt.show()
# Now fit a simple sine function to the data
from scipy import optimize
def test_func(x, a, b):
return a * np.sin(b * x)
params, params_covariance = optimize.curve_fit(test_func, x_data, y_data,
p0=[2, 2])
print(params)
# And plot the resulting curve on the data
plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data, label='Data')
plt.plot(x_data, test_func(x_data, params[0], params[1]),
label='Fitted function')
plt.legend(loc='best')
plt.show()
# Demos various methods to find the minimum of a function.
def f(x):
return x**2 + 10*np.sin(x)
x = np.arange(-10, 10, 0.1)
plt.plot(x, f(x))
plt.show()
# Now find the minimum with a few methods
from scipy import optimize
# The default (Nelder Mead)
print(optimize.minimize(f, x0=0))
# Methods: As the function is a smooth function, gradient-descent based methods are good options.
# The lBFGS algorithm is a good choice in general:
res = optimize.minimize(f, x0=0, method="L-BFGS-B")
print(res)
xMin = res.x
plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()
# Global minimum: A possible issue with this approach is that,
# if the function has local minima, the algorithm may find these
# local minima instead of the global minimum depending on the initial point x0:
res = optimize.minimize(f, x0=3, method="L-BFGS-B")
xMin = res.x
plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()
# If we don’t know the neighborhood of the global minimum to choose
# the initial point, we need to resort to costlier global optimization.
# To find the global minimum, we use scipy.optimize.basinhopping()
# (added in version 0.12.0 of Scipy).
# t combines a local optimizer with sampling of starting points:
res = optimize.basinhopping(f, 0)
xMin = res.x
plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()
# We can constrain the variable to the interval (0, 10) using the “bounds” argument:
res = optimize.minimize(f, x0=3, bounds=((0, 10), ))
xMin = res.x
plt.plot(x, f(x))
plt.plot(xMin, f(xMin), 'o')
plt.show()
def f(x):
return x**2 + 10*np.sin(x)
x = np.arange(-10, 10, 0.1)
plt.plot(x, f(x))
plt.show()
# Root finding
root = optimize.root(f, 1) # our initial guess is 1
print("First root found %s" % root.x)
root2 = optimize.root(f, -2.5)
print("Second root found %s" % root2.x)
# Find minima
# Global optimization
grid = (-10, 10, 0.1)
xmin_global = optimize.brute(f, (grid, ))
print("Global minima found %s" % xmin_global)
# Constrain optimization
xmin_local = optimize.fminbound(f, 0, 10)
print("Local minimum found %s" % xmin_local)
# Plot function, minima, and roots
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(111)
# Plot the function
ax.plot(x, f(x), 'b-', label="f(x)")
# Plot the minima
xmins = np.array([xmin_global[0], xmin_local])
ax.plot(xmins, f(xmins), 'go', label="Minima")
# Plot the roots
roots = np.array([root.x, root2.x])
ax.plot(roots, f(roots), 'kv', label="Roots")
# Decorate the figure
ax.legend(loc='best')
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.axhline(0, color='gray')
plt.show()
1) pandas, statsmodels, seaborn: https://www.scipy-lectures.org/packages/statistics/index.html#statistics
2) pyMC (for Baysian statistics): http://pymc-devs.github.io/pymc/tutorial.html
# Sample from a normal distribution using numpy's random number generator
samples = np.random.normal(size=10000)
# Compute a histogram of the sample
bins = np.linspace(-5, 5, 30)
histogram, bins = np.histogram(samples, bins=bins, normed=True)
bin_centers = 0.5*(bins[1:] + bins[:-1])
# Compute the PDF on the bin centers from scipy distribution object
from scipy import stats
pdf = stats.norm.pdf(bin_centers)
plt.figure(figsize=(6, 4))
plt.plot(bin_centers, histogram, label="Histogram of samples")
plt.plot(bin_centers, pdf, label="PDF")
plt.legend()
plt.show()
mean, std = stats.norm.fit(samples)
print(mean,std)
np.mean(samples)
np.median(samples)
# The median is also the percentile 50, because 50% of the observation are below it:
stats.scoreatpercentile(samples, 50)
# Similarly, we can calculate the percentile 90:
stats.scoreatpercentile(samples, 90)
# Generates 2 sets of observations
samples1 = np.random.normal(0, size=1000)
samples2 = np.random.normal(1, size=1000)
# Compute a histogram of the sample
bins = np.linspace(-4, 4, 30)
histogram1, bins = np.histogram(samples1, bins=bins, normed=True)
histogram2, bins = np.histogram(samples2, bins=bins, normed=True)
plt.figure(figsize=(6, 4))
plt.hist(samples1, bins=bins, normed=True, label="Samples 1")
plt.hist(samples2, bins=bins, normed=True, label="Samples 2")
plt.legend(loc='best')
plt.show()
from scipy import stats
np.random.seed(12345678)
x = np.random.random(10)
y = np.random.random(10)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print("r-squared:", r_value**2)
plt.plot(x, y, 'o', label='original data')
plt.plot(x, intercept + slope*x, 'r', label='fitted line')
plt.legend()
plt.show()
a) https://docs.scipy.org/doc/scipy/reference/tutorial/index.html
b) https://github.com/kuleshov/cs228-material/blob/master/tutorials/python/cs228-python-tutorial.ipynb
c) https://www.scipy-lectures.org/intro/scipy.html#scipy
d) https://www.scipy-lectures.org/
Click here to download the source code of this tutorial.