# Author: Chris Rathman, 25 July 2000 # File: regression.py # Purpose: Simple linear and multivariate regression functions # Version: 1.00 import random import types #----------------------------------------------------------------------# # function: testMe # #----------------------------------------------------------------------# def testMe(): # Purpose: try out some simple regression tests # first order equation: Y = 1 + 2X myCoefficients = linearRegression([[1, 0], [3, 1], [5, 2]], 1) print myCoefficients # get the r-squared value for the regression myRSquared = linearRSquared([[1, 0], [3, 1], [5, 2]], myCoefficients) # first order equation: Y = 2 + 2X print linearRegression([[1, 0], [3, 1], [3, 0], [5, 1]], 1) # first order equation: Y = 100 + 50X myData = [] for x in range(100): myData.append([100 + x*50, x]) print linearRegression(myData, 1) # second order equation: Y = 1 + 2X + 3X^2 myData = [] for x in range(100): myData.append([1 + 2*x + 3*x*x, x]) print linearRegression(myData, 2) # third order equation: Y = 4 + 3X + 2X^2 + 1X^3 myData = [] for x in range(100): myData.append([4 + 3*x + 2*x*x + 1*x*x*x, x]) print linearRegression(myData, 3) # multivariate equation: Y = 6 + 5X + 4X^2 + 3XZ + 2Z + 1Z^2 myData = [] for x in range(100): z = random.randint(0, 100) myData.append([6 + 5*x + 4*x*x + 3*x*z + 2*z + 1*z*z, x, z]) myEquations = [] myEquations.append(lambda rawItem, coefIndex: 1) myEquations.append(lambda rawItem, coefIndex: rawItem[1]) myEquations.append(lambda rawItem, coefIndex: rawItem[1] * rawItem[1]) myEquations.append(lambda rawItem, coefIndex: rawItem[1] * rawItem[2]) myEquations.append(lambda rawItem, coefIndex: rawItem[2]) myEquations.append(lambda rawItem, coefIndex: rawItem[2] * rawItem[2]) print regression(myData, myEquations) #----------------------------------------------------------------------# # function: linearRegression # #----------------------------------------------------------------------# def linearRegression(rawData, equationOrder): # Purpose: Regress the coefficients for an equation of the form: # Y = C0 + C1*X + C2*X^2 + ... + Cn*X^n # The regression outputs a list of the coefficients ([C0,C1,..,C[n]) # rawData: A supplied list of Y and X values which serve as a basis # for computing the coefficients. The general form of the list is: # [[Y0, X0],[Y1, X1],...,[Ym,Xm]]. # equationOrder: The equation order is the power function for the X # variable where the number of coefficients returned is the equation + 1. # For example, an equation of 1 maps to Y = C0 + C1X. An equation order # of 2 maps to Y = C0 + C1*X + C2*X*X. # require part of contract assert(type(rawData) == types.ListType), \ "Raw data input must be a list" assert(equationOrder > 0), \ "Equation order must be greater than 0th order" assert(len(rawData) >= equationOrder), \ "Number of data points must be >= to number of coefficients be calculated" for each in (rawData): assert(type(each) == types.ListType), \ "Raw data input must be a list of data values" assert(len(each) > 1), \ "More than one data point is required for the raw data" assert(len(each) == len(rawData[0])), \ "All data points in raw data must have the same number items" xEquationForm = [lambda rawItem, coefIndex: pow(rawItem[1], coefIndex)] return regression(rawData, xEquationForm * (equationOrder+1)) #----------------------------------------------------------------------# # function: regression # #----------------------------------------------------------------------# def regression(rawData, xEquationForm, yEquationForm = lambda rawItem: rawItem[0]): # Purpose: Regress the coefficients for an equation of a generalized form # as supplied by the xEquationForm and yEquationForm functions: # Y = C0*X0 + C1*X1 + C2*X2 + ... + Cn*Xn # Where Y = yEquationForm() and Xn = xEquationForm[n]() # The regression outputs a list of the coefficients ([C0,C1,..,C[n]) # rawData: A supplied list of Y and X values which serve as a basis # for computing the coefficients. The general form of the list is: # [[Y0, X0],[Y1, X1],...,[Ym,Xm]]. # equationOrder: The equation order is the power function for the X # variable where the number of coefficients returned is the equation + 1. # For example, an equation of 1 maps to Y = C0 + C1X. An equation order # of 2 maps to Y = C0 + C1*X + C2*X*X. # require part of contract assert(type(rawData) == types.ListType), \ "Raw data input must be a list" assert(type(xEquationForm) == types.ListType), \ "X Equation form must be defined in a list" assert(type(yEquationForm == types.FunctionType)), \ "Y Equation form must be a lambda function" assert(len(xEquationForm) > 0), \ "X Equation form must not be an empty list" assert(len(rawData) >= len(xEquationForm)), \ "Number of data points must be >= to number of coefficients be calculated" for each in (xEquationForm): assert(type(each) == types.FunctionType), \ "X Equation form must be lambda functions" for each in (rawData): assert(type(each) == types.ListType), \ "Raw data input must be a list of data values" assert(len(each) > 1), \ "More than one data point is required for the raw data" assert(len(each) == len(rawData[0])), \ "All data points in raw data must have the same number items" # number of coefficients being solved for numCoefficients = len(xEquationForm) # the value of each term for the equation term = [0] * numCoefficients # the matrices for the simultaneous equations B = [0] * numCoefficients A = [] for i in range(numCoefficients): A.append([0] * numCoefficients) # loop through all the raw data samples for each in rawData: # sum the y values yCurrent = float(yEquationForm(each)) B[0] = B[0] + yCurrent # sum the x values for i in range(numCoefficients): term[i] = float(xEquationForm[i](each, i)) A[0][i] = A[0][i] + term[i] # now set up the rest of rows in the matrices # (multiplying each row by each term) for i in range(1, numCoefficients): B[i] = B[i] + yCurrent * term[i] for j in range(numCoefficients): A[i][j] = A[i][j] + term[i] * term[j] # solve for the coefficients return gauss(A, B) #----------------------------------------------------------------------# # function: linearRSquared # #----------------------------------------------------------------------# def linearRSquared(rawData, coefficients): # Purpose: Compute the R-Squared statistic for the supplied coefficients # require part of contract assert(type(rawData) == types.ListType), \ "Raw data input must be a list" assert(type(coefficients) == types.ListType), \ "Computed coefficients must be a list" assert(len(coefficients) > 0), \ "At least coefficient is required" assert(len(rawData) >= len(coefficients)), \ "Number of data points must be >= to number of coefficients" for each in (rawData): assert(type(each) == types.ListType), \ "Raw data input must be a list of data values" assert(len(each) > 1), \ "More than one data point is required for the raw data" assert(len(each) == len(rawData[0])), \ "All data points in raw data must have the same number items" xEquationForm = [lambda rawItem, coefIndex: pow(rawItem[1], coefIndex)] return solveRSquared(rawData, coefficients, xEquationForm * len(coefficients)) #----------------------------------------------------------------------# # function: solveRSquared # #----------------------------------------------------------------------# def solveRSquared(rawData, coefficients, xEquationForm, \ yEquationForm = lambda rawItem: rawItem[0]): # Purpose: Compute the R-Squared statistic for the supplied coefficients # require part of contract assert(type(rawData) == types.ListType), \ "Raw data input must be a list" assert(type(xEquationForm) == types.ListType), \ "X Equation form must be defined in a list" assert(type(yEquationForm == types.FunctionType)), \ "Y Equation form must be a lambda function" assert(len(xEquationForm) > 0), \ "X Equation form must not be an empty list" assert(len(rawData) >= len(xEquationForm)), \ "Number of data points must be >= to number of coefficients be calculated" for each in (xEquationForm): assert(type(each) == types.FunctionType), \ "X Equation form must be lambda functions" for each in (rawData): assert(len(each) == len(rawData[0])), \ "All data points in raw data must have the same number items" assert(len(each) > 1), \ "More than one data point is required for the raw data" # number of coefficients being solved for numCoefficients = len(xEquationForm) # sum of y*y ysquare = 0 # number of raw data samples samples = len(rawData) # the value of each term for the equation term = [0] * numCoefficients # the matrices for the simultaneous equations B = [0] * numCoefficients A = [] for i in range(numCoefficients): A.append([0] * numCoefficients) # use the first column as the default value for the dependent variable if yEquationForm == []: yEquationForm = lambda rawItem: rawItem[0] # loop through all the raw data samples for each in rawData: # sum the y values yCurrent = float(yEquationForm(each)) B[0] = B[0] + yCurrent ysquare = ysquare + yCurrent * yCurrent # sum the x values for i in range(numCoefficients): term[i] = float(xEquationForm[i](each, i)) A[0][i] = A[0][i] + term[i] # now set up the rest of rows in the matrices # (multiplying each row by each term) for i in range(1, numCoefficients): B[i] = B[i] + float(yCurrent * term[i]) for j in range(numCoefficients): A[i][j] = A[i][j] + term[i] * term[j] # calculate the r-squared statistic sumsquare = 0 yaverage = B[0] / samples for i in range(numCoefficients): xaverage = A[0][i] / samples sumsquare = sumsquare + coefficients[i] * (B[i] - (samples * xaverage * yaverage)) return sumsquare / (ysquare - (samples * yaverage * yaverage)) #----------------------------------------------------------------------# # function: gauss # #----------------------------------------------------------------------# def gauss(AMatrix, BMatrix): # solve linear equations of the form: # |A00 A01 ... A0n| |coefficient0| |B0| # |A10 A11 ... A1n| * |coefficient1| = |B1| # |... ... ... ...| | ... | |..| # |An0 An1 ... Ann| |coefficientn| |Bn| # where |A| and |B| are supplied and |coefficient| is the solution. # require part of contract assert(type(AMatrix) == types.ListType), \ "Input matrix must be a list" assert(type(BMatrix) == types.ListType), \ "Input matrix must be a list" assert(len(AMatrix) > 0), \ "Input matrix must not be an empty list" assert(len(AMatrix) == len(BMatrix)), \ "A and B matrix must have same number of rows" for each in (AMatrix): assert(len(each) == len(AMatrix)), \ "A matrix must be of square (NxN) dimensions" # get the length of the matrix n = len(AMatrix) # copy the matrices to local variables - inplace substitution used A = map(lambda x: list(x), AMatrix) B = list(BMatrix) # initialize the output results coefficients = [0] * n # condition the matrix for the solution for i in range(n-1): pivot = A[i][i] assert(pivot != 0), \ "Zero pivot value encountered" for j in range(i+1, n): multiplier = float(A[j][i]) / pivot for k in range(i+1, n): A[j][k] = A[j][k] - multiplier * A[i][k] B[j] = B[j] - multiplier * B[i] # solve for the coefficients assert(A[n-1][n-1] != 0), \ "Divide by zero encountered in solution" coefficients[n-1] = float(B[n-1]) / A[n-1][n-1] for i in range(n-2, -1, -1): top = B[i] for j in range(i+1, n): top = top - (A[i][j]* coefficients[j]) assert(A[i][i] != 0), \ "Divide by zero encountered in solution" coefficients[i] = float(top) / A[i][i] return coefficients # test for module when run in isolation if __name__ == "__main__": testMe()