
Commit a5c5f95

Commit message: done

1 parent 3d1c6be commit a5c5f95

25 files changed: +1087 −1 lines changed

.gitattributes

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
*.pyc
.DS_Store

Makefile

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
help:
	@echo "make regression : For running the Gradient Descent regression model on a random dataset"
	@echo "make polynomial_features : For testing the transformation built similar to sklearn's polynomial preprocessing"
	@echo "make normal_regression : To check how theta varies with degree"
	@echo "make poly_theta : To check how theta varies with degree using polynomial features"
	@echo "make contour : For generating a contour plot of the gradient descent"
	@echo "make compare_time : For comparing the time taken by normal regression and gradient descent"
	@echo "make collinear : For checking feature dependency (collinear features)"

regression:
	@ python linear_regression_test.py

polynomial_features:
	@ python poly_features_test.py

normal_regression:
	@ python Normal_regression.py

poly_theta:
	@ python degreevstheta.py

contour:
	@ python plot_contour.py

compare_time:
	@ python compare_time.py

collinear:
	@ python collinear_dataset.py

Normal_regression.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
import numpy as np
import matplotlib.pyplot as plt
from preprocessing.polynomial_features import PolynomialFeatures

x = np.array([i*np.pi/180 for i in range(60, 300, 4)])
np.random.seed(10)  # Setting seed for reproducibility
y = 4*x + 7 + np.random.normal(0, 3, len(x))
# print(x)

def normal_regression(X, y):
    # Closed-form (normal equation) solution: theta = (X^T X)^(-1) X^T y
    X_transpose = np.transpose(X)
    A = np.linalg.inv(X_transpose.dot(X))
    B = X_transpose.dot(y)
    return A.dot(B)

arr_norm = []
degrees = [i+1 for i in range(9)]
x = np.array(np.matrix(x).transpose())  # reshape x into a column vector of shape (N, 1)

include_bias = True
for degree in degrees:
    poly = PolynomialFeatures(degree, include_bias=include_bias)
    X = poly.transform(x)
    coeff = normal_regression(X, y)
    arr_norm.append(np.linalg.norm(coeff))

plt.plot(degrees, arr_norm)
plt.xlabel("Degree of the polynomial")
plt.ylabel("Magnitude of coefficient (theta)")
plt.savefig('./images/q5plot.png')
plt.show()

README.md

Lines changed: 143 additions & 1 deletion
@@ -1 +1,143 @@
-# ML-Linear-Regression-from-scratch
+# Linear Regression ⭐⭐

## Directory Structure 📁

```
│   collinear_dataset.py
│   compare_time.py
│   contour_plot.gif
│   degreevstheta.py
│   gif1.gif
│   gif2.gif
│   linear_regression_test.py
│   line_plot.gif
│   Makefile
│   metrics.py
│   Normal_regression.py
│   plot_contour.py
│   poly_features_test.py
│   README.md
│   surface_plot.gif
│
├───images
│       q5plot.png
│       q6plot.png
│       q8features.png
│       q8samples.png
│
├───linearRegression
│   │   linearRegression.py
│   │   __init__.py
│   │
│   └───__pycache__
│           linearRegression.cpython-37.pyc
│           __init__.cpython-37.pyc
│
├───preprocessing
│   │   polynomial_features.py
│   │   __init__.py
│   │
│   └───__pycache__
│           polynomial_features.cpython-37.pyc
│           __init__.cpython-37.pyc
│
├───temp_images
└───__pycache__
        metrics.cpython-37.pyc
```

## Instructions to run 🏃

```make help```<br>
```make regression```<br>
```make polynomial_features```<br>
```make normal_regression```<br>
```make poly_theta```<br>
```make contour```<br>
```make compare_time```<br>
```make collinear```<br>

## Stochastic GD (Batch size = 1) ☝️

- Learning rate type = constant
  RMSE: 0.9119624181584616
  MAE: 0.7126923090787688

- Learning rate type = inverse
  RMSE: 0.9049599308106121
  MAE: 0.7098334683036919

## Vanilla GD (Batch size = N) ✋

- Learning rate type = constant
  RMSE: 0.9069295672718122
  MAE: 0.7108301179089876

- Learning rate type = inverse
  RMSE: 0.9607329070540364
  MAE: 0.7641616657610887

## Mini-Batch GD (Batch size between 1 and N; here 5) 🤘

- Learning rate type = constant
  RMSE: 0.9046502501334435
  MAE: 0.7102161700019564

- Learning rate type = inverse
  RMSE: 0.9268357442221973
  MAE: 0.7309246821952116

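The RMSE/MAE values above come from the repository's test script (run via `make regression`). As a minimal illustration of what the "constant" and "inverse" learning-rate types mean, here is a sketch of a single gradient-descent step on the mean-squared-error loss; the names `gd_step`, `lr0`, and `iteration` are illustrative and not taken from the repository's `LinearRegression` class.

```python
import numpy as np

def gd_step(theta, X, y, lr0, iteration, lr_type="constant"):
    """One gradient-descent step on the mean squared error (illustrative sketch).

    lr_type='constant' keeps the learning rate fixed at lr0;
    lr_type='inverse' decays it as lr0 / iteration (iteration starting at 1).
    """
    N = len(y)
    y_hat = X.dot(theta)
    grad = (2.0 / N) * X.T.dot(y_hat - y)   # gradient of the MSE w.r.t. theta
    lr = lr0 if lr_type == "constant" else lr0 / iteration
    return theta - lr * grad
```

With `lr_type='inverse'` the step size shrinks as 1/iteration, so later updates become progressively smaller.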

## Polynomial Feature Transformation 🔰

- The output for [[1, 2]] is [[1, 1, 2, 1, 2, 4]]

- The output for [[1, 2, 3]] is [[1, 1, 2, 3, 1, 2, 3, 4, 6, 9]]

- The outputs match sklearn's PolynomialFeatures fit_transform (see the sketch below)

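A minimal sketch of how such an expansion can be produced with `itertools.combinations_with_replacement`; the function name `poly_transform` is illustrative, and the repository's actual implementation lives in `preprocessing/polynomial_features.py`.

```python
from itertools import combinations_with_replacement
import numpy as np

def poly_transform(X, degree=2, include_bias=True):
    # For each sample, emit the product of every combination of its features
    # (with replacement) of size 0..degree; the size-0 combination gives the bias term 1.
    X = np.asarray(X)
    start = 0 if include_bias else 1
    out = []
    for row in X:
        feats = []
        for d in range(start, degree + 1):
            for combo in combinations_with_replacement(row, d):
                feats.append(np.prod(combo) if combo else 1.0)
        out.append(feats)
    return np.array(out)

print(poly_transform([[1, 2]]))      # [[1. 1. 2. 1. 2. 4.]]
print(poly_transform([[1, 2, 3]]))   # [[1. 1. 2. 3. 1. 2. 3. 4. 6. 9.]]
```

The two printed rows reproduce the example outputs listed above.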

## Theta vs degree 📈

![alt text](images/q5plot.png?raw=true)

- Conclusion - As the degree of the polynomial increases, the norm of theta increases because of overfitting.

## L2 Norm of Theta vs Degree of Polynomial for varying Sample size 📈

![alt text](images/q6plot.png?raw=true)

**Conclusion**

- As the degree increases, the magnitude of theta increases due to overfitting of the data.
- But at the same degree, as the number of samples increases, the magnitude of theta decreases, because more samples reduce the overfitting to some extent.

## Linear Regression line fit 🔥
![alt text](line_plot.gif?raw=true)

## Linear Regression Surface plot 🔥
![alt text](surface_plot.gif?raw=true)

## Linear Regression Contour plot 🔥
![alt text](contour_plot.gif?raw=true)


## Time Complexities ⏳

- Theoretical time complexity of the Normal equation is **O(D^2N) + O(D^3)**
- Theoretical time complexity of Gradient Descent is **O((t+N)D^2)**, where t is the number of iterations (see the note below)

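For reference, the D^2N and D^3 terms come from the closed-form normal-equation solution (as used in `Normal_regression.py`): forming the D×D matrix X^T X costs O(N·D^2), and inverting it costs O(D^3).

```latex
\theta^{*} = (X^{\top} X)^{-1} X^{\top} y
```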

## Time vs Number of Features ⏳📊

![alt text](images/q8features.png?raw=true)

When the number of samples is kept constant, the normal-equation solution takes more time because its complexity contains a D^3 factor, whereas gradient descent only has a D^2 factor.

## Time vs Number of Samples ⏳📊

![alt text](images/q8samples.png?raw=true)

When the number of features is kept constant and the number of samples is varied, the time for the normal equation is still higher than for gradient descent because of its higher computational cost.


## Multicollinearity in Dataset ❗ ❗

- The gradient descent implementation still works in the presence of multicollinearity.
- But as the multiplication factor increases, the RMSE and MAE values shoot up sharply.
- Multicollinearity also reduces the precision of the estimated coefficients (see the sketch below).
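A quick way to see why the closed-form solution struggles while gradient descent still runs: a duplicated (scaled) column makes X^T X rank-deficient and extremely ill-conditioned. This is a minimal sketch on a synthetic matrix, independent of the repository's `LinearRegression` class.

```python
import numpy as np

np.random.seed(42)
N, P = 30, 4
X = np.random.randn(N, P)
X = np.hstack([X, 6 * X[:, [P - 1]]])   # append a column that is exactly 6x the last feature

gram = X.T @ X
print(np.linalg.matrix_rank(gram))  # expected 4 < 5 columns: X^T X is (numerically) singular
print(np.linalg.cond(gram))         # very large condition number -> unstable, imprecise coefficients
```

Because X^T X has no well-defined inverse here, the normal equation is unreliable, whereas the iterative gradient-descent updates still converge to one of the many equivalent minimizers.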

collinear_dataset.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from linearRegression.linearRegression import LinearRegression
from metrics import *

np.random.seed(42)

N = 30

print("----------------------------------- Multi collinear ----------------------------------")

P = 4
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))

X[P] = X.iloc[:][P-1]*6  # add a column that is an exact multiple of the last feature (perfect collinearity)
# print(X)

LR = LinearRegression(fit_intercept=True)
LR.fit_vectorised(X, y)
y_hat = LR.predict(X)
print('RMSE: ', rmse(y_hat, y))
print('MAE: ', mae(y_hat, y))

print("----------------------------------------- Normal dataset -------------------------------------")

P = 5
Xnew = pd.DataFrame(np.random.randn(N, P))
ynew = pd.Series(np.random.randn(N))
# print(Xnew)

LRnew = LinearRegression(fit_intercept=True)
LRnew.fit_vectorised(Xnew, ynew)
y_hatnew = LRnew.predict(Xnew)
print('RMSE: ', rmse(y_hatnew, ynew))
print('MAE: ', mae(y_hatnew, ynew))

compare_time.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from linearRegression.linearRegression import LinearRegression
import time

# np.random.seed(42)

grad = []
normal = []
num_features = []

N = 30
# Vary the number of features while keeping the number of samples fixed
for i in range(50, 1000, 5):
    X = pd.DataFrame(np.random.randn(N, i))
    y = pd.Series(np.random.randn(N))

    LR = LinearRegression(fit_intercept=True)
    start = time.time()
    LR.fit_vectorised(X, y)
    grad.append(time.time() - start)

    LR_normal = LinearRegression(fit_intercept=True)
    start_time = time.time()
    LR_normal.fit_normal(X, y)
    normal.append(time.time() - start_time)
    num_features.append(i)

plt.plot(num_features, grad, label='Gradient Descent')
plt.plot(num_features, normal, label='Normal Equation')
plt.xlabel('Num of features')
plt.ylabel('time in seconds')
plt.legend(loc='best')
plt.savefig('./images/q8features.png')
plt.show()

grad = []
normal = []
num_samples = []
P = 20

# Vary the number of samples while keeping the number of features fixed
for i in range(50, 2000, 5):
    X = pd.DataFrame(np.random.randn(i, P))
    y = pd.Series(np.random.randn(i))

    LR = LinearRegression(fit_intercept=True)
    start = time.time()
    LR.fit_vectorised(X, y)
    grad.append(time.time() - start)

    LR_normal = LinearRegression(fit_intercept=True)
    start_time = time.time()
    LR_normal.fit_normal(X, y)
    normal.append(time.time() - start_time)
    num_samples.append(i)

plt.plot(num_samples, grad, label='Gradient Descent')
plt.plot(num_samples, normal, label='Normal Equation')
plt.xlabel('Num of samples')
plt.ylabel('time in seconds')
plt.legend(loc='best')
plt.savefig('./images/q8samples.png')
plt.show()

contour_plot.gif

23.5 MB

degreevstheta.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from preprocessing.polynomial_features import PolynomialFeatures

np.random.seed(42)

def normal_regression(X, y):
    # Closed-form (normal equation) solution: theta = (X^T X)^(-1) X^T y
    X_transpose = np.transpose(X)
    A = np.linalg.inv(X_transpose.dot(X))
    B = X_transpose.dot(y)
    return A.dot(B)

lst = []
degrees = [1, 3, 5, 7, 9]
sample_size = []
l = 0
for N in range(10, 200, 40):
    # x = np.random.rand(N)
    x = np.array([i*np.pi/180 for i in range(N, 300, 4)])
    y = 4*x + 7 + np.random.normal(0, 3, len(x))
    x = np.array(np.matrix(x).transpose())  # reshape x into a column vector
    temp = []
    for degree in degrees:
        poly = PolynomialFeatures(degree, include_bias=True)
        X = poly.transform(x)
        coeff = normal_regression(X, y)
        temp.append(np.log(np.linalg.norm(np.array(coeff))))
    lst.append(temp)
    l += 1
    sample_size.append(len(x))

for i in range(1, l+1):
    plt.plot(degrees, lst[i-1], label='Num of Samples ' + str(sample_size[i-1]))
plt.xlabel("Value of degree")
plt.ylabel("Log of L2 norm of coefficients")
plt.legend(loc='best')
plt.savefig('./images/q6plot.png')
plt.show()

gif1.gif

317 KB

gif2.gif

381 KB

images/q5plot.png

24.1 KB

images/q6plot.png

52.6 KB

images/q8features.png

46.2 KB

images/q8samples.png

77.2 KB

line_plot.gif

1.03 MB

linearRegression/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
