- SVM can handle data sets with many features and uses advanced mathematical trickery — the "kernel trick" — to separate the data; the kernel defines the separating line or hyperplane.
- A kernel is a mathematical transformation function applied to the existing features.
It helps to draw a clearer line between the different groups we want to classify. - SVM is a supervised learning technique and uses train/test splits.
- SVC - support vector classification - classify data using SVM.
We can use different "kernels" with SVC, e.g. linear, RBF, polynomial. - SVM draws a line (in 2D) or a hyperplane (in n-dimensional space) such that it maximizes the margins between the classification groups. It maximizes the margins between the support vectors (the nearby data points) and the decision boundary line.
python code example:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm, datasets
from pylab import *


def createClusterdData(N, k):
    """Create fake (income, age) clusters for N people spread across k clusters.

    Returns (X, y): X is an (N', 2) array of [income, age] points and y is the
    integer cluster label for each point (N' = k * floor(N/k)).
    """
    np.random.seed(1234)  # fixed seed so the clusters are reproducible
    pointsPerCluster = float(N) / k
    X = []
    y = []
    for i in range(k):
        # Random cluster center: income in [20000, 200000], age in [20, 70].
        incomeCentroid = np.random.uniform(20000, 200000)
        ageCentroid = np.random.uniform(20, 70)
        for j in range(int(pointsPerCluster)):
            X.append([np.random.normal(incomeCentroid, 10000.0),
                      np.random.normal(ageCentroid, 2.0)])
            y.append(i)
    X = np.array(X)
    y = np.array(y)
    return X, y


def plotPredictions(clf):
    """Plot clf's decision regions over a dense grid, with the (global) scaled data on top."""
    # Dense grid of sample points over the scaled feature range [-1, 1).
    xx, yy = np.meshgrid(np.arange(-1, 1, .001), np.arange(-1, 1, .001))
    npx = xx.ravel()
    npy = yy.ravel()
    # Convert to a list of 2D (income, age) points.
    samplePoints = np.c_[npx, npy]
    # Generate predicted labels for each grid point.
    Z = clf.predict(samplePoints)
    plt.figure(figsize=(8, 6))
    # Reshape results to match xx dimensions.
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
    # np.float was removed from NumPy (1.24+); the builtin float is the correct spelling.
    plt.scatter(X[:, 0], X[:, 1], c=y.astype(float))
    plt.show()


(X, y) = createClusterdData(100, 5)

plt.figure(figsize=(8, 6))
# Scatter column 0 vs. column 1; c = color keyed by label so each
# cluster gets a different color.
plt.scatter(X[:, 0], X[:, 1], c=y.astype(float))
plt.show()

# Scale both features into [-1, 1] so income (~10^5) doesn't dominate age (~10^1).
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X)
X = scaling.transform(X)

plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=y.astype(float))
plt.show()

# Use a linear SVC to partition our graph into clusters.
C = 1.0  # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C).fit(X, y)
plotPredictions(svc)

# The model was trained on scaled data, so a raw (income, age) query point
# must be passed through the same MinMax transform before predicting.
print(svc.predict(scaling.transform([[5000, 65]])))

No comments:
Post a Comment