-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_analysis.R
More file actions
165 lines (149 loc) · 7.12 KB
/
run_analysis.R
File metadata and controls
165 lines (149 loc) · 7.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
## Getting and Cleaning Data Course Project
# The organization and use of functions is to avoid polluting the namespace
# as much as possible by keeping intermediate variables within function scope.
# Excessive commenting is present for the ease of peer review.
##### Configuration and setup
# NOTE(review): setwd() in scripts is discouraged; left disabled on purpose.
#setwd("~/dsgacdcp")
# URL of the zipped UCI HAR Dataset to download (URL-encoded filename).
input_data_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
# Name of the tidy output file written to the current working directory.
output_file_name <- "tidy.txt"
# Require/install prerequisites
# Require/install prerequisites.
# For each package name in `packages`: install it if it is not already
# available, then attach it. Uses library() (not require()) for the final
# attach so that a package that is still missing after installation raises
# an error instead of a silent warning.
load_prerequisites <- function(packages) {
  for (pkg in packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    # library() errors loudly if the install above did not succeed.
    library(pkg, character.only = TRUE)
  }
}
# Given a URL pointing to a zip file, checks to see if the file, or extracted
# folder from the file exists in the current directory, and if not, downloads
# and extracts it. Returns the directory name or input_file
# Given a URL pointing to a zip file, checks to see if the file (or the
# directory extracted from it) exists in the current directory, and if not,
# downloads and extracts it. Returns `input_file` when one was supplied,
# otherwise the extracted directory name.
#
# Fixes relative to the original:
#  - The existence check now tests the relevant target. Previously the
#    condition started with `nchar(input_file) && ...`, which is FALSE for
#    the default input_file = "", so the download could never happen when
#    only a URL was passed (as this script does).
#  - The "Did not find file" message now reports input_file, not datadir.
#  - download.file uses mode = "wb": zip archives are binary and "w"
#    corrupts them on Windows.
#  - quit() is called with named arguments; quit(1) passes 1 as the `save`
#    argument, which must be a character string, and itself errors.
#  - The ".zip" suffix is stripped with an anchored, escaped pattern.
get_extract_file <- function(url, input_file = "") {
  file <- basename(URLdecode(url))
  datadir <- gsub("\\.zip$", "", file)
  # The thing whose existence decides whether we must download/extract.
  target <- if (nchar(input_file) > 0) input_file else datadir
  if (!file.exists(target)) {
    if (nchar(input_file) > 0) {
      message(sprintf("Did not find file: '%s'", input_file))
    } else {
      message(sprintf("Did not find '%s' directory", datadir))
    }
    if (!file.exists(file)) {
      # No zip file either, download it.
      msg <- sprintf("Downloading '%s' from %s...", file, url)
      message(msg)
      download.file(url = url, destfile = file, mode = "wb", method = "curl")
      if (!file.exists(file)) {
        # Still no zip file? Something went wrong...
        message(sprintf("Error %s", msg))
        quit(save = "no", status = 1)
      } else {
        # Keep a record of when this data was downloaded next to the
        # data itself.
        fc <- file(sprintf("%s-date-downloaded.txt", datadir))
        writeLines(date(), fc)
        close(fc)
      }
    } else {
      message("Found zip file...")
    }
    message("Unzipping file...")
    unzip(file)
  }
  if (nchar(input_file) > 0) input_file else datadir
}
# Extracts only the measurements on the mean and standard deviation for each
# measurement from the test and train data sets, returning a merged data set.
# Extracts only the measurements on the mean and standard deviation for each
# measurement from the test and train data sets, returning a merged data set.
#
# Requirements 1-4 of the assignment are handled here (not in numeric order).
# Returns one data.frame with columns:
#   subject_id, activity_id, activity_name, <selected measurement columns>
merge_data_sets <- function(datadir) {
  ## Requirement #4 groundwork: clean up feature names for readability and
  ## use as variable names. Remove a hyphen and uppercase the character that
  ## follows it (keeping the rest of the case intact), dropping the "()" and
  ## any trailing hyphen.
  # Example: "fBodyAccJerk-meanFreq()-X" -> "fBodyAccJerkMeanFreqX"
  raw_features <- read.table(sprintf("%s/features.txt", datadir))[, 2]
  features <- gsub("-([a-z])(.*)?\\(\\)-?", "\\U\\1\\E\\2",
                   raw_features, perl = TRUE)
  ## Requirement #2: extract only mean/std measurements. Build a vector of
  ## column indices to filter both data sets later. The pattern assumes the
  ## names were already transformed above (Mean/Std capitalized).
  output_features <- grep("^(f|t).*(Mean|Std)", features)
  ## Read activity labels into a data frame.
  activity_labels <- read.table(sprintf("%s/activity_labels.txt", datadir),
                                col.names = c("id", "name"))
  # Build each data set in a list and bind once at the end, instead of
  # growing a data.frame with rbind() inside the loop.
  pieces <- list()
  # For each of the two sets, read the 3 files (subject_%s.txt, X_%s.txt,
  # y_%s.txt), attach activity names, and column-bind the results.
  for (set_name in c("test", "train")) {
    message(sprintf("Processing %s data...", set_name))
    ## Requirement #4: label the data set with descriptive variable names
    ## via the col.names parameters to read.table below.
    subject <- read.table(
      sprintf("%s/%s/subject_%s.txt", datadir, set_name, set_name),
      col.names = "subject_id")
    # Requirement #2: data is filtered here using output_features.
    data <- read.table(
      sprintf("%s/%s/X_%s.txt", datadir, set_name, set_name),
      col.names = features, check.names = FALSE)[, output_features]
    activities <- read.table(
      sprintf("%s/%s/y_%s.txt", datadir, set_name, set_name),
      col.names = "activity_id")
    ## Requirement #3: use descriptive activity names, looked up by id.
    activities$activity_name <- activity_labels$name[
      match(activities$activity_id, activity_labels$id)]
    ## Requirement #1: merge via cbind of subjects, activities and data.
    pieces[[set_name]] <- cbind(subject, activities, data)
  }
  # Return filtered, labeled, merged training/test datasets as one data.frame.
  do.call(rbind, unname(pieces))
}
# Create tidy data set
create_tidy_set <- function(datadir, output_file_name) {
# Requirement 5 is satisfied by this function
# First handle requirements 1-4...
data <- merge_data_sets(datadir)
## Requirement #5: From the data set in step 4, creates a second,
## independent tidy data set with the average of each variable for each
## activity and each subject.
message("Pivoting data...")
# Pivot the values of the data into individual rows, keeping the
# identifier columns passed as id intact, resulting in a data frame
# containing the following columns:
# "subject_id", "activity_id", "activity_name", "variable", "value"
pivoted_data <- melt(data, id = c(
"subject_id", "activity_id", "activity_name"))
message("Creating tidy data set...")
# Output the mean of all the values for each variable
tidy <- dcast(pivoted_data,
subject_id + activity_name ~ variable, mean)
# write the tidy data out to a file in the current working directory
message(sprintf("Writing tidy data set to '%s'...", output_file_name))
write.table(tidy, file = output_file_name)
## I used the following during development to easily start my CodeBook
#fc <- file("CodeBook.columns.txt")
#writeLines(gsub("^", colnames(tidy), replacement="* "), fc)
#close(fc)
}
# ---- Entry point: install/attach dependencies, fetch data, write tidy set ----
load_prerequisites(c("data.table", "reshape2"))
datadir <- get_extract_file(input_data_url)
create_tidy_set(datadir, output_file_name)