3.1 R and SAS Syntax
Below are some examples that use adsl
to show how common operations are done in SAS and R.
3.1.1 Packages and Sample Data
# Data
"https://github.com/phuse-org/TestDataFactory/raw/main/Updated/TDF_ADaM/"
adam_path <- haven::read_xpt(paste0(adam_path, "adsl.xpt"))
adsl <-
# Select a few variables
adsl %>%
adsl <- dplyr::select(STUDYID, USUBJID, SUBJID, AGE, TRT01P, TRTSDT, TRTEDT, RACE, SEX, DISCONFL)
3.1.2 PROC CONTENTS \(\rightarrow\) summary()
To explore the variables of your dataset you can use PROC contents in SAS:
proc contents data = adsl;
run;
In R, you can use str()
and summary()
instead:
str(adsl)
## tibble [254 × 10] (S3: tbl_df/tbl/data.frame)
## $ STUDYID : chr [1:254] "CDISCPILOT01" "CDISCPILOT01" "CDISCPILOT01" "CDISCPILOT01" ...
## ..- attr(*, "label")= chr "Study Identifier"
## $ USUBJID : chr [1:254] "01-701-1015" "01-701-1023" "01-701-1028" "01-701-1033" ...
## ..- attr(*, "label")= chr "Unique Subject Identifier"
## $ SUBJID : chr [1:254] "1015" "1023" "1028" "1033" ...
## ..- attr(*, "label")= chr "Subject Identifier for the Study"
## $ AGE : num [1:254] 63 64 71 74 77 85 68 81 84 52 ...
## ..- attr(*, "label")= chr "Age"
## $ TRT01P : chr [1:254] "Placebo" "Placebo" "Xanomeline High Dose" "Xanomeline Low Dose" ...
## ..- attr(*, "label")= chr "Planned Treatment for Period 01"
## $ TRTSDT : Date[1:254], format: "2014-01-02" "2012-08-05" ...
## $ TRTEDT : Date[1:254], format: "2014-07-02" "2012-09-01" ...
## $ RACE : chr [1:254] "WHITE" "WHITE" "WHITE" "WHITE" ...
## ..- attr(*, "label")= chr "Race"
## $ SEX : chr [1:254] "F" "M" "M" "M" ...
## ..- attr(*, "label")= chr "Sex"
## $ DISCONFL: chr [1:254] "" "Y" "" "Y" ...
## ..- attr(*, "label")= chr "Did the Subject Discontinue the Study?"
summary(adsl)
## STUDYID USUBJID SUBJID AGE
## Length:254 Length:254 Length:254 Min. :51.00
## Class :character Class :character Class :character 1st Qu.:70.00
## Mode :character Mode :character Mode :character Median :77.00
## Mean :75.09
## 3rd Qu.:81.00
## Max. :89.00
## TRT01P TRTSDT TRTEDT
## Length:254 Min. :2012-07-09 Min. :2012-08-28
## Class :character 1st Qu.:2013-01-26 1st Qu.:2013-05-12
## Mode :character Median :2013-06-13 Median :2013-09-27
## Mean :2013-06-17 Mean :2013-10-10
## 3rd Qu.:2013-11-07 3rd Qu.:2014-03-15
## Max. :2014-09-02 Max. :2015-03-05
## RACE SEX DISCONFL
## Length:254 Length:254 Length:254
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
3.1.3 PROC FREQ \(\rightarrow\) count()
In order to get the frequencies for one variables you use PROC FREQ in SAS.
PROC FREQ data = adsl;
TABLES SEX;
RUN;
And for cross tables:
PROC FREQ data = adsl;
TABLES SEX * TRT01P * RACE;
RUN;
In R we use table()
or count()
from the tidyverse package:
For One Variable
table(adsl$SEX)
%>%
adsl count(SEX)
##
## F M
## 143 111
SEX | n |
---|---|
F | 143 |
M | 111 |
For Multiple Variables
#for multiple variables
table(adsl$SEX, adsl$TRT01P, adsl$RACE)
%>%
adsl count(SEX, TRT01P, RACE)
## , , = AMERICAN INDIAN OR ALASKA NATIVE
##
##
## Placebo Xanomeline High Dose Xanomeline Low Dose
## F 0 0 0
## M 0 1 0
##
## , , = BLACK OR AFRICAN AMERICAN
##
##
## Placebo Xanomeline High Dose Xanomeline Low Dose
## F 5 6 6
## M 3 3 0
##
## , , = WHITE
##
##
## Placebo Xanomeline High Dose Xanomeline Low Dose
## F 48 34 44
## M 30 40 34
SEX | TRT01P | RACE | n |
---|---|---|---|
F | Placebo | BLACK OR AFRICAN AMERICAN | 5 |
F | Placebo | WHITE | 48 |
F | Xanomeline High Dose | BLACK OR AFRICAN AMERICAN | 6 |
F | Xanomeline High Dose | WHITE | 34 |
F | Xanomeline Low Dose | BLACK OR AFRICAN AMERICAN | 6 |
F | Xanomeline Low Dose | WHITE | 44 |
M | Placebo | BLACK OR AFRICAN AMERICAN | 3 |
M | Placebo | WHITE | 30 |
M | Xanomeline High Dose | AMERICAN INDIAN OR ALASKA NATIVE | 1 |
M | Xanomeline High Dose | BLACK OR AFRICAN AMERICAN | 3 |
M | Xanomeline High Dose | WHITE | 40 |
M | Xanomeline Low Dose | WHITE | 34 |
3.1.4 KEEP/DROP \(\rightarrow\) select()
DATA adsl2;
SET adsl;
KEEP subjidn; *or drop subjidn;
RUN;
In R we use the select()
function from the tidyverse package.
# to keep (only) SUBJID
%>%
adsl select(SUBJID) %>%
head() # keep only first 6 rows
# to keep all variables except SUBJID
%>%
adsl select(-SUBJID) %>%
head()
SUBJID |
---|
1015 |
1023 |
1028 |
1033 |
1034 |
1047 |
STUDYID | USUBJID | AGE | TRT01P | TRTSDT | TRTEDT | RACE | SEX | DISCONFL |
---|---|---|---|---|---|---|---|---|
CDISCPILOT01 | 01-701-1015 | 63 | Placebo | 2014-01-02 | 2014-07-02 | WHITE | F | |
CDISCPILOT01 | 01-701-1023 | 64 | Placebo | 2012-08-05 | 2012-09-01 | WHITE | M | Y |
CDISCPILOT01 | 01-701-1028 | 71 | Xanomeline High Dose | 2013-07-19 | 2014-01-14 | WHITE | M | |
CDISCPILOT01 | 01-701-1033 | 74 | Xanomeline Low Dose | 2014-03-18 | 2014-03-31 | WHITE | M | Y |
CDISCPILOT01 | 01-701-1034 | 77 | Xanomeline High Dose | 2014-07-01 | 2014-12-30 | WHITE | F | |
CDISCPILOT01 | 01-701-1047 | 85 | Placebo | 2013-02-12 | 2013-03-09 | WHITE | F | Y |
3.1.5 Subsetting data (WHERE/IF) \(\rightarrow\) filter()
DATA female;
SET adsl;
WHERE SEX = 'F';
RUN;
%>%
adsl filter(SEX == "F") %>%
head()
# multiple conditions:
%>%
adsl filter(SEX == "F" & AGE > 70) %>%
head()
STUDYID | USUBJID | SUBJID | AGE | TRT01P | TRTSDT | TRTEDT | RACE | SEX | DISCONFL |
---|---|---|---|---|---|---|---|---|---|
CDISCPILOT01 | 01-701-1015 | 1015 | 63 | Placebo | 2014-01-02 | 2014-07-02 | WHITE | F | |
CDISCPILOT01 | 01-701-1034 | 1034 | 77 | Xanomeline High Dose | 2014-07-01 | 2014-12-30 | WHITE | F | |
CDISCPILOT01 | 01-701-1047 | 1047 | 85 | Placebo | 2013-02-12 | 2013-03-09 | WHITE | F | Y |
CDISCPILOT01 | 01-701-1111 | 1111 | 81 | Xanomeline Low Dose | 2012-09-07 | 2012-09-16 | WHITE | F | Y |
CDISCPILOT01 | 01-701-1133 | 1133 | 81 | Xanomeline High Dose | 2012-10-28 | 2013-04-28 | WHITE | F | |
CDISCPILOT01 | 01-701-1146 | 1146 | 75 | Xanomeline High Dose | 2013-05-20 | 2013-06-26 | WHITE | F | Y |
STUDYID | USUBJID | SUBJID | AGE | TRT01P | TRTSDT | TRTEDT | RACE | SEX | DISCONFL |
---|---|---|---|---|---|---|---|---|---|
CDISCPILOT01 | 01-701-1034 | 1034 | 77 | Xanomeline High Dose | 2014-07-01 | 2014-12-30 | WHITE | F | |
CDISCPILOT01 | 01-701-1047 | 1047 | 85 | Placebo | 2013-02-12 | 2013-03-09 | WHITE | F | Y |
CDISCPILOT01 | 01-701-1111 | 1111 | 81 | Xanomeline Low Dose | 2012-09-07 | 2012-09-16 | WHITE | F | Y |
CDISCPILOT01 | 01-701-1133 | 1133 | 81 | Xanomeline High Dose | 2012-10-28 | 2013-04-28 | WHITE | F | |
CDISCPILOT01 | 01-701-1146 | 1146 | 75 | Xanomeline High Dose | 2013-05-20 | 2013-06-26 | WHITE | F | Y |
CDISCPILOT01 | 01-701-1153 | 1153 | 79 | Placebo | 2013-09-23 | 2014-03-16 | WHITE | F |
3.1.6 Sorting data
PROC SORT data=adsl out=adsl_sort_age;
BY AGE;
RUN;
In R we use the arrange()
function from tidyverse:
# ascending
%>%
adsl arrange(AGE)%>%
head()
# descending
%>%
adsl arrange(-AGE)%>%
head()
STUDYID | USUBJID | SUBJID | AGE | TRT01P | TRTSDT | TRTEDT | RACE | SEX | DISCONFL |
---|---|---|---|---|---|---|---|---|---|
CDISCPILOT01 | 01-701-1341 | 1341 | 51 | Xanomeline Low Dose | 2013-01-05 | 2013-01-26 | WHITE | M | Y |
CDISCPILOT01 | 01-701-1118 | 1118 | 52 | Placebo | 2014-03-12 | 2014-09-09 | WHITE | M | |
CDISCPILOT01 | 01-709-1007 | 1007 | 54 | Xanomeline Low Dose | 2012-07-31 | 2012-08-28 | WHITE | F | Y |
CDISCPILOT01 | 01-701-1180 | 1180 | 56 | Xanomeline High Dose | 2013-02-12 | 2013-03-18 | WHITE | M | Y |
CDISCPILOT01 | 01-701-1239 | 1239 | 56 | Xanomeline High Dose | 2014-01-11 | 2014-07-10 | WHITE | M | |
CDISCPILOT01 | 01-701-1287 | 1287 | 56 | Xanomeline High Dose | 2014-01-25 | 2014-07-26 | WHITE | F |
STUDYID | USUBJID | SUBJID | AGE | TRT01P | TRTSDT | TRTEDT | RACE | SEX | DISCONFL |
---|---|---|---|---|---|---|---|---|---|
CDISCPILOT01 | 01-710-1083 | 1083 | 89 | Placebo | 2013-07-22 | 2013-08-01 | WHITE | F | Y |
CDISCPILOT01 | 01-703-1295 | 1295 | 88 | Xanomeline High Dose | 2013-11-21 | 2014-04-19 | WHITE | F | Y |
CDISCPILOT01 | 01-710-1002 | 1002 | 88 | Xanomeline Low Dose | 2014-01-14 | 2014-01-18 | WHITE | M | Y |
CDISCPILOT01 | 01-710-1368 | 1368 | 88 | Placebo | 2013-10-23 | 2014-04-24 | WHITE | F | |
CDISCPILOT01 | 01-714-1035 | 1035 | 88 | Placebo | 2014-04-17 | 2014-10-16 | WHITE | F | |
CDISCPILOT01 | 01-701-1387 | 1387 | 87 | Placebo | 2014-03-12 | 2014-03-25 | WHITE | F | Y |
3.1.7 Creating new variables
DATA adsl;
SET adsl;
length AGEGR1 $20.; * length function in R means something different
IF age > 50 then AGEGR1 = ‘> 50 years old’;
ELSE if age <= 50 then AGEGR1 = ‘<= 50 years old’;
run;
In R we use the mutate()
function:
%>%
adsl mutate(AGEGR1 = case_when(
> 50 ~"> 50 years old",
AGE <= 50 ~"<= 50 years old",
AGE %>%
)) head()
STUDYID | USUBJID | SUBJID | AGE | TRT01P | TRTSDT | TRTEDT | RACE | SEX | DISCONFL | AGEGR1 |
---|---|---|---|---|---|---|---|---|---|---|
CDISCPILOT01 | 01-701-1015 | 1015 | 63 | Placebo | 2014-01-02 | 2014-07-02 | WHITE | F | > 50 years old | |
CDISCPILOT01 | 01-701-1023 | 1023 | 64 | Placebo | 2012-08-05 | 2012-09-01 | WHITE | M | Y | > 50 years old |
CDISCPILOT01 | 01-701-1028 | 1028 | 71 | Xanomeline High Dose | 2013-07-19 | 2014-01-14 | WHITE | M | > 50 years old | |
CDISCPILOT01 | 01-701-1033 | 1033 | 74 | Xanomeline Low Dose | 2014-03-18 | 2014-03-31 | WHITE | M | Y | > 50 years old |
CDISCPILOT01 | 01-701-1034 | 1034 | 77 | Xanomeline High Dose | 2014-07-01 | 2014-12-30 | WHITE | F | > 50 years old | |
CDISCPILOT01 | 01-701-1047 | 1047 | 85 | Placebo | 2013-02-12 | 2013-03-09 | WHITE | F | Y | > 50 years old |
3.1.8 Handling of missing values
Missing values in SAS and R:
- SAS
- Missing value is a blank/a single decimal point for character/numeric variables, see details in Missing Values in SAS;
- R
- Missing data in R appears as
NA
.NA
is not a string nor a numeric value, but an indicator of missingness, see detailed examples in Advanced R #3.2.3 Missing values; NA
and""
are different in R:""
is a blank string, whileNA
is missing;- “Missing values (
NA
) andNaN
values are regarded as non-comparable even to themselves, so comparisons involving them will always result inNA
” admiral::convert_blanks_to_na()
can turn SAS blank strings into proper RNA
(from admiral documentation).
- Missing data in R appears as
# Unlike SAS, space(s) != blank string in R,
" " == ""
## [1] FALSE
# is.na to check if it's missing
c(" ", "Y", NA_character_) %>% is.na()
## [1] FALSE FALSE TRUE
# NA is non-comparable, result in NA
c(" ", "Y", NA_character_) != "Y"
## [1] TRUE FALSE NA
Unexpected results might occur with “SAS habits”, below are three examples to illustrate the different handling of missing values in R.
3.1.8.1 Subsetting data
!=
is not the same in R when it involves NA
# add one more row to the data with missing AGE and DISCONFL, select the first 3 rows
adsl %>%
adsl_na_example <- add_row(SUBJID = "1", DISCONFL = NA_character_, .before = T) %>%
slice(1:3) %>%
select(SUBJID, DISCONFL)
# demo data
adsl_na_example
SUBJID | DISCONFL |
---|---|
1 | NA |
1015 | |
1023 | Y |
Subset to DISCONFL != "Y"
, only DISCONFL = " "
is selected, not NA
%>% filter(DISCONFL != "Y") adsl_na_example
SUBJID | DISCONFL |
---|---|
1015 |
To display NA
, add is.na(DISCONFL)
to the filter
# include is.na(DISCONFL) in the filter
%>% filter(DISCONFL != "Y" | is.na(DISCONFL)) adsl_na_example
SUBJID | DISCONFL |
---|---|
1 | NA |
1015 |
3.1.8.2 Sorting data
In SAS, missing is treated as the smallest value in proc sort
; while in R, NA
is always at the bottom after ascending or descending sorting.
Create demo data
# add one more row to the data with missing AGE and DISCONFL
# select the first 3 rows
adsl %>%
adsl_na_sort <- add_row(SUBJID = "1", AGE = NA_integer_, .before = T) %>%
slice(1:5) %>%
select(SUBJID, AGE)
SUBJID | AGE |
---|---|
1 | NA |
1015 | 63 |
1023 | 64 |
1028 | 71 |
1033 | 74 |
Sort by ascending order of AGE
# ascending
%>%
adsl_na_sort arrange(AGE)
SUBJID | AGE |
---|---|
1015 | 63 |
1023 | 64 |
1028 | 71 |
1033 | 74 |
1 | NA |
Sort by descending order of AGE
# descending
%>%
adsl_na_sort arrange(-AGE)
SUBJID | AGE |
---|---|
1033 | 74 |
1028 | 71 |
1023 | 64 |
1015 | 63 |
1 | NA |
NA
can also be placed at the top with below workaround
# to be consistent with SAS - NA at the top when ascending
%>%
adsl_na_sort arrange(!is.na(AGE), AGE)
SUBJID | AGE |
---|---|
1 | NA |
1015 | 63 |
1023 | 64 |
1028 | 71 |
1033 | 74 |
3.1.8.3 Creating new variables
When AGE is missing, AGEGR1 is set to ‘<= 50 years old’ in SAS; while in R it is set to NA_character_
.
%>%
adsl_na_sort mutate(AGEGR1 = case_when(
> 50 ~"> 50 years old",
AGE <= 50 ~"<= 50 years old",
AGE ))
SUBJID | AGE | AGEGR1 |
---|---|---|
1 | NA | NA |
1015 | 63 | > 50 years old |
1023 | 64 | > 50 years old |
1028 | 71 | > 50 years old |
1033 | 74 | > 50 years old |
3.1.9 Merging Data
We create another data set with weight information for some of the subjects:
sample(adsl$SUBJID, 100, replace = FALSE)
SUBJID <- sample(50:100, 100, replace = TRUE)
WEIGHT <-
data.frame(SUBJID, WEIGHT) adsl_weight <-
Now, we would like to join the weight column to our adsl data set.
In SAS, we use the merge function:
* inner join;
data adsl_merge;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl and inweight; * inner join in SAS
run;
* outer join;
data adsl_merge_all;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl or inweight; * default in SAS
run;
* left join;
data adsl_merge_left;
merge adsl(in = inadsl) adsl_weight(in = inweight);
by subjidn;
if inadsl;
run;
In R, we use again the tidyverse package:
# inner join
%>%
adsl inner_join(adsl_weight, by = "SUBJID")
# outer join
%>%
adsl full_join(adsl_weight, by = "SUBJID")
# left join
%>%
adsl left_join(adsl_weight, by = "SUBJID")
3.1.10 Concatenating Data
To demonstrate stacking or concatenating data sets, we can first split adsl into two parts, each having 5 records with the same columns in common.
# first 5 records of adsl
1 <- adsl[c(1:5),]
adsl_
# the next 5 records of adsl
2 <- adsl[c(6:10),] adsl_
In SAS, you could use the set command:
DATA adsl_stacked;
SET adsl_1 adsl_2;
RUN;
In R, you may accomplish this like so:
# tidyverse method - bind_rows()
adsl_1 %>%
adsl_stacked <- bind_rows(adsl_2)