Instructions
Requirements and Specifications
Source Code
import numpy as np
import matplotlib.pyplot as plt
from scipy import interpolate
import pandas as pd
## Define Data
## Define Data
# Census years: one sample per decade from 1790 to 2000 (22 samples).
x = np.linspace(1790, 2000, 22)
# y is in thousands
y = np.array([3929, 5308, 7240, 9638, 12866, 17069, 23192, 31443, 38558,
              50156, 62948, 75995, 91972, 105711, 122755, 131669, 150697,
              179323, 203212, 226505, 249709.873, 284416])
# Convert from thousands to absolute population counts.
y = y * 1E3
## First, let's normalize the data so the calculation of the coefficients will be more accurate
# Min-max scaling maps each series onto [0, 1].
xn = (x - x.min()) / (x.max() - x.min())
yn = (y - y.min()) / (y.max() - y.min())
print(xn[:5])
print(yn[:5])
# Model 1: Interpolating Polynomial using all $n$ points and creating a polynomial of degree $n-1$
We are asked to fit the data with a polynomial of degree $n-1$, where $n$ is the number of points in the dataset.
# An interpolating polynomial through all n points has degree n - 1.
n = len(x)
summary = f"There are {n} data points in the dataset so the polynomial will be of degree {n-1}"
print(summary)
The polynomial of degree $n-1$ will be of the form:
$y_{fit} = k_{0} + k_{1}x + k_{2}x^{2} + k_{3}x^{3} + ... + k_{n-1}x^{n-1}$
## Calculate coefficients
### The coefficients $k$ are calculated using the following equation
$k=(A^{T}A)^{-1}A^{T}b$
### Where $A$ is the matrix containing the terms multiplying the coefficients $k_{i}$ to be found. Then, given a column vector $x$ containing all the x-points, the matrix $A$ is:
$A=[1, x, x^{2}, x^{3}, x^{4}, ... , x^{n-1}]$
### While the vector $b$ is just a column vector containing the y-points:
$b = y$
### Create Matrix A
### Create Matrix A
# Vandermonde-style design matrix: column i holds x**i (column 0 is all ones).
A = np.ones((n, n))
for power in range(1, n):
    A[:, power] = x ** power
### Define vector b
# Right-hand side as an (n, 1) column vector.
b = y.reshape(-1, 1)
### Find coefficients
# Normal-equations solution k = (A^T A)^+ A^T b; the pseudo-inverse copes
# with the near-singular A^T A produced by the large year values.
k = np.linalg.pinv(A.T @ A) @ (A.T @ b)
## Coefficients obtained (bare expression so the notebook displays it)
k
## Finally, calculate the fitted values
### Function that will help me to calculate the y-value given a x-value or an array of x-values
def f(x, k):
    """
    Evaluate the polynomial with coefficients k at x (Horner not used;
    plain power sum, matching the fitted model k0 + k1*x + ... ).

    x: single value or array of values
    k: array containing the coefficients of the polynomial
    """
    total = k[0] * np.power(x, 0)
    for power, coeff in enumerate(k[1:], start=1):
        total += coeff * np.power(x, power)
    return total
# Evaluate the fitted polynomial on a dense yearly grid spanning the data.
x_new = np.arange(x[0], x[-1] + 1, 1)
y_fit = f(x_new, k)
## Now Plot
plt.figure()
plt.plot(x, y, 'r.', label='Data')
plt.plot(x_new, y_fit, label='Fitted')
plt.xlabel('Year')
plt.ylabel('Population')
plt.grid(True)
plt.legend()
plt.show()
### Population in 1968:
x_val = 1968.0
# Fix: the fit uses n points, so its degree is n-1 (see the earlier cell),
# not n — report the correct degree.
print(f"The population in {x_val} given the fitted polynomial of degree {n-1} is: {f(x_val,k)[0]}")
### Population in 1999
x_val = 1999.0
# Fix: the fitted polynomial has degree n-1, not n — report the correct degree.
print(f"The population in {x_val} given the fitted polynomial of degree {n-1} is: {f(x_val,k)[0]}")
### Population in 2020
Note: The actual population in 2020 in the USA, according to https://www.census.gov/quickfacts/fact/table/US/PST045219, was 331,002,651.
x_val = 2020.0
# Fix: the fitted polynomial has degree n-1, not n — report the correct degree.
print(f"The population in {x_val} given the fitted polynomial of degree {n-1} is: {f(x_val,k)[0]}")
## Model 2: Using Newton's Divided Difference
We see that the calculated model is not bad, since the fitted values are very close to the original dataset (see the plot above), but when extrapolating (estimating a future value), the value obtained is above the real one.
## Calculate Newton's Divided Difference coefficients
def calculate_DDT(x, y):
    """
    This function calculates Newton's Divided Difference Table of coefficients.

    x: x-values
    y: y-values
    Returns an (n, n) array: entry [i, j] is the j-th order divided
    difference starting at point i, so row 0 holds the Newton coefficients.
    """
    m = len(y)
    coef = np.zeros((m, m))
    # Zeroth-order differences are the data values themselves.
    coef[:, 0] = y
    for order in range(1, m):
        for row in range(m - order):
            dx = x[row + order] - x[row]
            dy = coef[row + 1, order - 1] - coef[row, order - 1]
            coef[row, order] = dy / dx
    return coef
table = calculate_DDT(x, y)
### Pick the degree from the table: stop at the first row whose divided
### differences (columns 1..n-1) all have magnitude below 0.01
optimal_degree = n - 1
for row_idx in range(n):
    magnitudes = np.abs(table[row_idx, 1:])
    if np.max(magnitudes) < 0.01:
        print(f"Optimal degree for row {row_idx}")
        optimal_degree = row_idx + 1
        break
# Build a table (DataFrame view of the divided-difference table)
df = pd.DataFrame(table)
df.head(len(df))
# Newton coefficients are the first `optimal_degree` entries of row 0.
coeffs = table[0, :optimal_degree]
degree = len(coeffs)
print(f"The Newton's Divided Diff. degree is {degree}")
# Bare expression so the notebook displays the coefficients.
coeffs
## Now calculate Newton's Divided Difference Polynomial
def poly(a, x_data, x):
    """
    Evaluate the Newton divided-difference polynomial by nested
    multiplication (Horner's scheme).

    a: array containing the Newton's Divided Diff coefficients
    x_data: array containing the original data x-values (nodes); only the
        first len(a) nodes are used
    x: value or array of values at which to evaluate the polynomial

    Fix: the degree is derived from len(a) instead of len(x_data). The
    original used n = len(x_data) - 1, so a truncated coefficient set
    (optimal_degree < n) indexed a[n] out of bounds. When
    len(a) == len(x_data) the result is unchanged.
    """
    m = len(a) - 1  # index of the highest-order coefficient
    yfit = a[m]
    for k in range(1, m + 1):
        yfit = a[m - k] + (x - x_data[m - k]) * yfit
    return yfit
# Define x values: dense yearly grid over the data range
x_new = np.arange(x[0], x[-1] + 1, 1)
a = coeffs
y_fit2 = poly(a, x, x_new)
# Plot the Newton fit against the raw data.
plt.figure()
plt.plot(x, y, 'r.', label='Data')
plt.plot(x_new, y_fit2, label='Fit')
plt.grid(True)
plt.legend()
plt.show()
### Population in 1968
x_val = 1968.0
estimate = poly(a, x, x_val)
print(f"The population in {x_val} given the Divided Difference Polynomial of degree {optimal_degree} is: {estimate}")
### Population in 1999
x_val = 1999.0
estimate = poly(a, x, x_val)
print(f"The population in {x_val} given the Divided Difference Polynomial of degree {optimal_degree} is: {estimate}")
### Population in 2020
x_val = 2020.0
estimate = poly(a, x, x_val)
print(f"The population in {x_val} given the Divided Difference Polynomial of degree {optimal_degree} is: {estimate}")
Now we see that the polynomial is good for intermediate values (values within the range of the original data), but for extrapolation the model is bad — it even predicted a negative value for 2020.