-
Notifications
You must be signed in to change notification settings - Fork 0
Dataset.jl and Corresponding Tests #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7dc2968
47fc954
787a88d
2dd2295
173cdd1
042d5d6
70cfa67
9b8c48c
bf9707b
214e74d
a87c905
9e1ad08
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,16 @@ | ||
| name = "RidgeRegression" | ||
| uuid = "739161c8-60e1-4c49-8f89-ff30998444b1" | ||
| authors = ["Vivak Patel <vp314@users.noreply.github.com>"] | ||
| version = "0.1.0" | ||
| authors = ["Eton Tackett <etont@icloud.com>", "Vivak Patel <vp314@users.noreply.github.com>"] | ||
|
|
||
| [deps] | ||
| CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" | ||
| DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" | ||
| Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" | ||
| RidgeRegression = "739161c8-60e1-4c49-8f89-ff30998444b1" | ||
|
|
||
| [compat] | ||
| CSV = "0.10.15" | ||
| DataFrames = "1.8.1" | ||
| Downloads = "1.7.0" | ||
| julia = "1.12.4" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ makedocs(; | |
| ), | ||
| pages=[ | ||
| "Home" => "index.md", | ||
| "Design" => "design.md", | ||
| ], | ||
| ) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,11 @@ | ||
| module RidgeRegression | ||
|
|
||
| # Write your package code here. | ||
| using CSV | ||
| using DataFrames | ||
| using Downloads | ||
|
|
||
| include("dataset.jl") | ||
|
|
||
| export Dataset, csv_dataset, one_hot_encode | ||
|
|
||
| end |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| """ | ||
| Dataset(name, X, y) | ||
|
|
||
| Contains datasets for ridge regression experiments. | ||
|
|
||
| # Fields | ||
| - `name::String`: Name of dataset | ||
| - `X::Matrix{Float64}`: Matrix of variables/features | ||
| - `y::Vector{Float64}`: Target vector | ||
|
|
||
| # Throws | ||
| - `ArgumentError`: If the number of rows in `X` does not equal the length of `y`. | ||
|
Comment on lines
+1
to
+12
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There should be documentation for the struct being created and then there should be documentation for the constructor in the same docstring. |
||
|
|
||
| !!! note | ||
| Used as the experimental unit for ridge regression experiments. | ||
| """ | ||
| struct Dataset | ||
| name::String | ||
| X::Matrix{Float64} | ||
| y::Vector{Float64} | ||
|
|
||
| function Dataset(name::String, X::AbstractMatrix, y::AbstractVector) | ||
| size(X, 1) == length(y) || | ||
| throw(ArgumentError("X and y must have same number of rows")) | ||
|
|
||
| new(name, Matrix{Float64}(X), Vector{Float64}(y)) | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you are interested in looking at sparse design matrices, this functionality precludes that as any matrix would be converted to `Matrix{Float64}`. |
||
| end | ||
| end | ||
|
|
||
| """ | ||
| one_hot_encode(Xdf::DataFrame; drop_first=true) | ||
|
|
||
| One-hot encode categorical (string-like) features in `Xdf`. | ||
|
|
||
| # Arguments | ||
| - `Xdf::DataFrame`: Input DataFrame of feature columns only; every column is encoded, so the response column must be removed beforehand. | ||
|
|
||
| # Keyword Arguments | ||
| - `drop_first::Bool=true`: If `true`, drop the first dummy column for | ||
| each categorical feature to avoid multicollinearity. | ||
|
|
||
| # Returns | ||
| - `Matrix{Float64}`: A numeric matrix containing the encoded features. | ||
| """ | ||
| function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} | ||
vp314 marked this conversation as resolved.
Show resolved
Hide resolved
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe this function should focus on one-hot encoding a specific column provided to the function rather than an entire data frame as we do not always know which columns should be one-hot encoded just from their type. Think of categorical data that is saved in the data set as integers rather than as words. |
||
| n = nrow(Xdf) | ||
| cols = Vector{Vector{Float64}}() | ||
|
|
||
| for name in names(Xdf) #Selecting columns that aren't the target variable and pushing them to the columns. | ||
| col = Xdf[!, name] | ||
| if eltype(col) <: Real | ||
| push!(cols, Float64.(col)) | ||
| continue | ||
| end | ||
|
|
||
| scol = string.(col) # Convert to string for categorical processing. | ||
| lv = unique(scol) #Get unique category levels. | ||
| ind = scol .== permutedims(lv) #Create indicator matrix for each level of the categorical variable. | ||
| #Permutedims is used to align the dimensions for broadcasting. | ||
| #Broadcasting compares each element of `scol` with each level in `lv`, resulting in a matrix where each column corresponds to a level and contains `true` for rows that match that level and `false` otherwise. | ||
|
|
||
| if drop_first && size(ind, 2) > 1 #Drop the first column of the indicator matrix to avoid multicollinearity if drop_first is true and there are multiple levels. | ||
| ind = ind[:, 2:end] | ||
| end | ||
|
|
||
| for j in 1:size(ind, 2) | ||
| push!(cols, Float64.(ind[:, j])) #Convert the boolean indicator columns to Float64 and add them to the list of columns. | ||
| end | ||
| end | ||
|
|
||
| p = length(cols) | ||
| X = Matrix{Float64}(undef, n, p) | ||
| for j in 1:p | ||
| X[:, j] = cols[j] | ||
| end | ||
|
|
||
| return Matrix{Float64}(X) | ||
|
|
||
| end | ||
| """ | ||
| csv_dataset(path_or_url; target_col, name="csv_dataset") | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a bad function name. |
||
|
|
||
| Load a dataset from a CSV file or URL. | ||
|
|
||
| # Arguments | ||
| - `path_or_url::String`: Local file path or web URL containing CSV data. | ||
|
|
||
| # Keyword Arguments | ||
| - `target_col`: Index or name of the column that holds the response variable. | ||
| - `name::String="csv_dataset"`: Dataset name. | ||
|
|
||
| # Returns | ||
| - `Dataset`: A dataset containing the encoded feature matrix `X`, response vector `y`, and dataset name. | ||
| """ | ||
| function csv_dataset(path_or_url::String; | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This does not follow BlueStyle |
||
| target_col, | ||
| name::String = "csv_dataset" | ||
| ) | ||
|
|
||
| filepath = | ||
| startswith(path_or_url, "http") ? | ||
| Downloads.download(path_or_url) : | ||
| path_or_url | ||
|
|
||
| df = DataFrame(CSV.File(filepath)) #Read CSV file into a DataFrame. | ||
| df = dropmissing(df) #Remove rows with missing values. | ||
| Xdf = select(df, DataFrames.Not(target_col)) #Select all columns except the target column for features. | ||
|
|
||
| y = target_col isa Int ? | ||
| df[:, target_col] : #If target_col is an integer, use it as a column index to extract the target variable from the DataFrame. | ||
| df[:, Symbol(target_col)] #Extract the target variable based on whether target_col is an index or a name. | ||
|
|
||
|
|
||
| X = one_hot_encode(Xdf; drop_first = true) | ||
|
|
||
|
|
||
|
|
||
| return Dataset(name, Matrix{Float64}(X), Vector{Float64}(y)) | ||
| end | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,9 @@ | ||
| [deps] | ||
| CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" | ||
| DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" | ||
| RidgeRegression = "739161c8-60e1-4c49-8f89-ff30998444b1" | ||
| Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" | ||
|
|
||
| [compat] | ||
| CSV = "0.10" | ||
| DataFrames = "1" |
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Review unit testing documentation in Julia to see how to do this correctly. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| using Test | ||
| using DataFrames | ||
| using CSV | ||
| using RidgeRegression | ||
| @testset "Dataset" begin | ||
| X = [1 2; 3 4] | ||
| y = [10, 20] | ||
| d = Dataset("toy", X, y) | ||
|
|
||
| @test d.name == "toy" | ||
| @test d.X isa Matrix{Float64} | ||
| @test d.y isa Vector{Float64} | ||
| @test size(d.X) == (2, 2) | ||
| @test length(d.y) == 2 | ||
| @test d.X[1, 1] == 1.0 | ||
| @test d.y[2] == 20.0 | ||
|
|
||
| @test_throws ArgumentError Dataset("bad", X, [1, 2, 3]) | ||
| end | ||
|
|
||
| @testset "one_hot_encode" begin | ||
| df = DataFrame( | ||
| A = ["red", "blue", "red", "green"], | ||
| B = [1, 2, 3, 4], | ||
| C = ["small", "large", "medium", "small"] | ||
| ) | ||
|
|
||
| X = redirect_stdout(devnull) do | ||
| one_hot_encode(df; drop_first = true) | ||
| end | ||
|
|
||
| @test size(X) == (4, 5) | ||
| @test X[:, 3] == [1.0, 2.0, 3.0, 4.0] | ||
| @test all(x -> x == 0.0 || x == 1.0, X[:, [1,2,4,5]]) | ||
| @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1) | ||
| @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1) | ||
| end | ||
|
|
||
| @testset "csv_dataset" begin | ||
| tmp = tempname() * ".csv" | ||
| df = DataFrame( | ||
| a = [1.0, 2.0, missing, 4.0], | ||
| b = ["x", "y", "y", "x"], | ||
| y = [10.0, 20.0, 30.0, 40.0] | ||
| ) | ||
| CSV.write(tmp, df) | ||
|
|
||
| d = redirect_stdout(devnull) do | ||
| csv_dataset(tmp; target_col=:y, name="tmp") | ||
| end | ||
|
|
||
| @test d.name == "tmp" | ||
| @test d.X isa Matrix{Float64} | ||
| @test d.y isa Vector{Float64} | ||
|
|
||
| @test length(d.y) == 3 | ||
| @test size(d.X, 1) == 3 | ||
| @test d.y == [10.0, 20.0, 40.0] | ||
|
|
||
| d2 = redirect_stdout(devnull) do | ||
| csv_dataset(tmp; target_col=3, name="tmp2") | ||
| end | ||
| @test d2.y == [10.0, 20.0, 40.0] | ||
| @test size(d2.X, 1) == 3 | ||
| end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All dependencies should appear in the Project.toml file. You should activate the package environment and then "add ..." your dependencies to ensure compatibility and a correct environment for the package.