Wednesday, November 4, 2015

Advance SAS Certificate Code

/**************************** Advance SAS Test**************************/
* Options Compress=Yes; * |NO;
%let obs_date='20JAN2001'd;
%let projroot = E:\ChenyxFiles\2014.Study.TrainingCertificate\SAS.Victoria\SAS.Exam;
%let projfolder = SAS.Advance.Code;
%let prjPath = &projroot\&projfolder;
libname sWire  "&prjPath\sasdata\wireless";
libname sRetail   "&prjPath\sasdata\retail";
libname sales "&prjPath\sasdata\retail";
%let macroPath = E:\ChenyxFiles\2014.Study.TrainingCertificate\SAS.Victoria\SAS.Exam\SAS.Advance.Code\sasmacro;
%include "&prjPath\sassyntax\control.sas";
%let today='20JAN2001'd;

data orderline;
set sales.orderline;
run;

proc contents data=sales._all_ nods;
run;


/*when learn: sas workflow*/
/*project background-business->data->manupulate->analysis->report->story->convince people*/

/*when test:  material->*/
/*when prepare: test material->transfer/used for project programming/implementation.*/
/**/


/*the SAS Adv Test will based on this project*/
/*the macro write to projet TeleComCustomerForcast*/
/*the report*/
/*the data*/
/*the export all to this project*/

proc options group=(performance memory);
run;


proc options;
run;

/*1. explore the dataset*/
/*proc sort data=wireless out=wireless nodupkey dupout=dupout;*/

/*

Group=PERFORMANCE
 ARMAGENT=         ARM Agent to use to collect ARM records
 ARMLOC=ARMLOG.LOG Identify location where ARM records are to be written
 ARMSUBSYS=(ARM_NONE)
                   Enable/Disable ARMing of SAS subsystems
 BUFNO=1           Number of buffers for each SAS data set
 BUFSIZE=0         Size of buffer for page of SAS data set
 CGOPTIMIZE=3      Control code generation optimization
 CMPMODEL=BOTH     Identify CMP model storage type
 CMPOPT=(NOEXTRAMATH NOMISSCHECK NOPRECISE NOGUARDCHECK NOGENSYMNAMES NOFUNCDIFFERENCING)
                   Enable SAS compiler performance optimizations
 COMPRESS=NO       Specifies whether to compress observations in output SAS data sets
 CPUCOUNT=4        Number of processors available.
 NODBIDIRECTEXEC   Do not use SQL optimization with SAS/ACCESS engines
 DBSLICEPARM=(THREADED_APPS, 2)
                   Alter DBMS engine threaded read behavior by expanding or disallowing
                   threaded reads.
 MAXSEGRATIO=75    SPD Engine pre-evaluation phase time ratio
 MEXECSIZE=65536   Maximum size for a macro to execute in memory
 MINPARTSIZE=16777216
                   Minimum partition size when creating SPD Engine files
 SORTSIZE=268435456
                   Size parameter for sort
 SPDEINDEXSORTSIZE=33554432
                   Identifies memory to be used for SPD Engine asynchronous index create or
                   append
 SPDEMAXTHREADS=0  Maximum number of threads for SPD Engine processing
 SPDESORTSIZE=33554432
                   Memory for SPD Engine sort operations
 SPDEUTILLOC=      Location where SPD Engine temporary utility files are created
 SPDEWHEVAL=COST   Specifies SPD Engine WHERE statement evaluation strategy
 SQLGENERATION=(NONE DBMS='TERADATA DB2 ORACLE NETEZZA')
                   Identify when to perform SQL generation for SAS Procedure input processing
 SQLREDUCEPUT=DBMS PROC SQL optimization for put() functions
 SQLREDUCEPUTOBS=0 PROC SQL optimization for put() functions observation threshold
 SQLREDUCEPUTVALUES=0
                   PROC SQL optimization for put() functions value threshold
 THREADS           Threads are available for use with features of the SAS System that
                   support threading
 MEMSIZE=2147483648
                   Specifies the limit on the total amount of memory to be used by the SAS
                   System

Group=MEMORY
 SORTSIZE=268435456
                   Size parameter for sort
 SUMSIZE=0         Upper limit for data-dependent memory usage during summarization
 MAXMEMQUERY=0     Maximum amount of memory returned when inquiring as to available space
 MEMBLKSZ=16777216 Size of memory blocks allocated to support MEMLIB and MEMCACHE options.
 MEMMAXSZ=2147483648
                   Maximum amount of memory allocated to support MEMLIB and MEMCACHE options.
 LOADMEMSIZE=0     Suggested memory limit for loaded SAS executables
 MEMSIZE=2147483648
                   Specifies the limit on the total amount of memory to be used by the SAS
                   System
 REALMEMSIZE=0     Limit on the total amount of real memory to be used by the SAS System
NOTE: PROCEDURE OPTIONS used (Total process time):
      real time           0.02 seconds
      user cpu time       0.00 seconds
      system cpu time     0.01 seconds
      memory              19.09k
      OS Memory           11196.00k
      Timestamp           22/06/2014 06:30:00 o'clock AM

Unix: use either the cron or the at command to schedule it to run at a
time when no others are running.


The default MEMSIZE value for SAS? in the UNIX environment is 128M,
but most likely you will find that not all of
your processes can run to completion using that value.
A more realistic MEMSIZE value for most processes is either
256M or 512M
 */


/*      real time           0.04 seconds--------- total time */
/*      user cpu time       0.01 seconds -----------only CPU time */
/*      system cpu time     0.03 seconds-----------other cpu time */
/*      memory              349.31k*/
/*      OS Memory           11196.00k*/
/*      Timestamp           22/06/2014 06:08:56 o'clock AM*/


/* OPTION REALMEMSIZE */
/*OPTION SORTSIZE=80M--> proc sort, proc sql*/
/*OPTION SUMSIZE --> proc MEANS, OLAP, REPORT, SUMMARY, SURVEYFREQ,*/
/*SURVEYLOGISTIC, SURVEYMEANS, and TABULATE procedures.*/
/*OPTION: WORK --> work dir*/




/*Page: from IO to Input buffer*/
/*BufNo=*/
/*BufferSize=*/


/*
I/O Performance Techniques
1) Read only data that is needed when using an INPUT statement.
2) Use KEEP / DROP statements or KEEP= / DROP= data set options to retain only
desired variables.
3) Use a WHERE statement or WHERE= data set option to subset rows of data.
4) Use data compression for large data sets using the COMPRESS= data set option.
5) Use indexes to optimize the retrieval of data.
6) Use the DATASETS procedure COPY statement to copy data sets with indexes.
7) Create data sets
8) Use LENGTH statements.
9) Use the OBS= and FIRSTOBS= data set options.
*/

/* Data compression:
only work on character type, efficient for long char data
*/

data wireless;
set sWire.wireless ;
run;
option fullstimer;  *  show full time cost report ;
/*http://support.sas.com/resources/papers/proceedings09/333-2009.pdf*/
proc sort data=wireless out=wireless nodupkey;
by acctno;
run;

proc sort data=wireless out=wireless nodup;
by acctno;
run;


proc contents data=wireless;
run;

data adt37_1;
set  wireless;
by notsorted actdt;
run;

proc sort data=wireless out=demo9(firstobs=3 obs=50); ;
by descending actdt;
run;

proc sort data=wireless out=demo9(obs=2) ; ;
by descending actdt;
run;

proc sql;
create table tmp as
select max(actdt) as actdt format=date9.
from wireless;
select * from tmp;
quit;

proc sort data=wireless out=demo9 ; ;
by descending actdt;
run;

proc sort data=demo9(obs=1) out=demo10 ; ;
by actdt;
run;

proc print data=demo9;
run;
proc print data=demo10;
run;

proc sort data=purchase out=_temp;
by descending visit_date;
run;

proc sort data=_temp(obs=1) out=most_recent_purchase;
by visit_date;
run;

proc sql noprint;
create table result as
select
quit;

data adt37_1;
set  wireless;
format actdt mmddyy10.;
by groupformat actdt;
run;

option MEMSIZE = 2147483648; * default :128M ;
/*compress data*/
data wireless(compress=yes);
set sWire.wireless ;
run;


data wireless;
set sWire.wirelessAll;
run;

/*1. explore the dataset*/
/*proc sort data=wireless out=wireless nodupkey dupout=dupout;*/
data wireless(compress=yes);
set sWire.wirelessAll;
run;

data wireless(compress=char);
set sWire.wirelessAll;
run;

/*After a file is compressed, the setting is*/
/*a permanent attribute of the file, when uncompressing a file, specify COMPRESS=NO*/
/*for a DATA step.*/


proc sort data=wireless out=wireless nodupkey;
by acctno;
run;

/*REUSE=NO results in less efficient usage of space if you delete or update*/
/*many observations in a SAS data set.*/

/*SAS character variables store data as 1 character per byte. A SAS character variable can*/
/*be from 1 to 32,767 bytes in length*/


/*the LENGTH statement affects the length of a numeric variable only in
the output data set. Numeric variables always have a length of 8*/
/*bytes in the program data vector and during processing.*/


/*Create Index*/
/*An index is an optional file that you can create for a SAS data file to provide direct*/
/*access to specific observations and is also analogous to the search function. */
/*For each data file, you can create one or multiple indexes. Once an index exists, SAS
treats it as part of*/
/*the data file. That is, if you add or delete observations or modify values,
the index is*/
/*automatically updated.*/


proc contents data=wireless;
run;

data wIndex(index=(acctno));
set wireless;
run;
proc contents data=wIndex; /* we can see the dataset is indexed */
run;

data wIndex;
set wIndex(keep=acctno actdt rename=(actdt=actDtNew));
run;
proc contents data=wIndex; /* we can see the dataset is not indexed
so: once a dataset is re-write, rename, update, then the index will be deleted.
*/
run;



data wIndex;
set wIndex(keep=acctno actdt rename=(acctno=actNoNew));
run;

proc contents data=wIndex; /* we can see the dataset is not indexed */
run;

data wwIndex(index=(acctno));
set wireless;
run;

data wIndex2(index=(acctno actdt));
set wireless;
run;

*The following example creates two simple indexes on the wireless data set.
The first index is named acctno, and it contains values of the acctno variable.
The second index is called actdt, and it contains unique values of the actdt variable;

data wIndex3(index=(acctno actdt/unique));      * create 2 indexes;
set wireless;
run;
/*The UNIQUE option guarantees that values for the key variable or the combination
of a composite group of variables remain unique for every observation in the data set.*/

/* create index*/
data adv1_frogs (index=(toxicity));
input species $ color $ location $ habitat $ toxicity $ endangered $;
datalines;
bullfrog green East boggy medium yes
river olive South swamps high no
sheep brown Texas pasture none no
;
/*
Compressing data set WORK.ADV1_FROGS increased size by 50.00 percent.
Compressed is 3 pages; un-compressed would require 2 pages.
*/

/*The following example creates a composite index on the Composite data set.
The index is named lethal_locales, and it contains concatenated values of the
toxicity variable and the location variable;*/
run;
data frogs (index= (lethal_locales=(toxicity location)));
input species $ color $ location $ habitat $ toxicity $ endangered $;
datalines;
bullfrog green East boggy medium yes
river olive South swamps high no
sheep brown Texas pasture none no
;

proc datasets lib=sasuser;
contents data=_all_;
quit;


proc datasets lib=sasuser;
contents data=_all_ nods;
quit;

proc contents data=_all_;
run;
proc datasets lib=WORK nolist;
 modify frogs;
 rename species=cat;
 format species $20. ; * visdate fromdate mmddyy10.;
 * informat todate visdate fromdate mmddyy10.;
quit;



proc datasets lib=WORK nolist;
 modify frogs;
 format cat $20. ; * visdate fromdate mmddyy10.;
 * informat todate visdate fromdate mmddyy10.;
quit;

/*MODIFY was successful for WORK.FROGS.DATA.*/
proc contents data=frogs;
run;
proc datasets lib=WORK nolist;
 modify frogs;
 rename cat=species;
 format species $20. ; * visdate fromdate mmddyy10.;
 * informat todate visdate fromdate mmddyy10.;
quit;

proc contents data=adv1_frogs;
run;

proc datasets nolist;
modify adv1_frogs;
index create toxicity;
run;
proc datasets ;
modify adv1_frogs;
index create location;
run;

proc datasets library=sashelp;
run;


proc datasets nolist;
modify adv1_frogs;
index create lethal_locales = (toxicity location);
run;


proc datasets ;
modify adv1_frogs;
index create lethal_locales1 = (toxicity species);
run;

proc datasets nolist;
modify wireless;
index create actdt_no = (actdt acctno);
run;


proc datasets nolist;
modify wireless;
index create acctno / unique nomiss;
index create actdt_no = (actdt acctno) / nomiss;
run;

/*NOMISS excludes from the index all observations with missing values for all
index variables. UNIQUE specifies that the combination of values of the index
variables must be unique.*/

proc datasets nolist;
modify wireless;
index delete acctno;
index delete actdt_no;
run;

proc sql;
create index actdt on wireless(actdt);
/*create index indexName on wireless(keyword);*/
quit;

proc sql;
select * from wireless where monotonic() between 1 and 2;
quit;
proc print data=wireless;
by actdt ;
run;

proc sql;
create index actno_dt on wireless(actdt, acctno);
/*create index indexName on wireless(keyword);*/
quit;


proc sql;
create unique index actno_dt2 on wireless(actdt, acctno);
/*create index indexName on wireless(keyword);*/
quit;

*********** Specify The Index Name in visit dataset ********************;
/*idname = indexKeyW*/
data adv_3_wireless;
set wireless(idxname=acctno); /* tell sas use this index name */
where acctno like "11769%";
run;

/*idwhere = yes|no*/
data adv_3_wireless;
set wireless(idwhere=no); /* tell sas where clause do not use index */
where acctno like "11769%";
run;

proc print data=wireless(obs=2);
run;
proc contents data=wireless;
run;
/****************** create a merged large dataset using "key= keywords"*************/
data book_of_frogs;
set frogs;
set pop_frogs key=species;  /* the second specify the key index for both */
run;

/*
The _IORC_ Variable
When you use the KEY= option, SAS creates an automatic variable named _IORC_,
which stands for INPUT/OUTPUT Return Code. You can use _IORC_ to determine
whether the index search was successful. If the value of _IORC_ is zero, SAS found a
matching observation. If the value of _IORC_ is not zero, SAS did not find a matching
observation.
*/

data work.profit3 work.errors; /* use key= to combine data */
set sasuser.dnunder;
set sasuser.sale2000(keep=routeid flightid date rev1st revbusiness revecon revcargo)
key=flightdate;
if _iorc_=0 then  /* _iorc_ ==== input output return code */
do;
Profit=sum(rev1st, revbusiness, revecon, revcargo,-expenses);
output work.profit3;
end;
else
do;
_error_=0;
output work.errors;
end;
run;


proc contents data=sRetail._all_;
run;

data sRetail.orders(index=(orderid));
set SRETAIL.orders;
run;

data sRetail.orderline(index=(orderid));
set sRetail.orderline;
run;

proc sort data=sRetail.orderline force;   /* force sort */
* indexed file can not be sort, if force sort, the index will lose ;
by orderid;
run;
data adv2_orders;
set SRETAIL.ORDERS ;
set SRETAIL.ORDERLINE key=orderid; * same as  a merge ;
run;


proc contents data=SRETAIL.ORDERS;
run;
proc contents data=adv2_orders;
run;


/*The SAS System may use an index when processing the following:
WHERE expression
KEY= in the SET
SCL table lookup
SQL join queries
BY processing

*/
/*Using BY-Group Processing with an Index*/
/*using BY-group processing with an index has two disadvantages:*/
/*
-It is generally less efficient than sequentially reading a sorted data set
because processing BY groups typically means retrieving the entire file.
-It requires storage space for the index.*/
/*
if the BY statement includes the DESCENDING or
NOTSORTED option or
if SAS detects that the data file is physically stored in sorted order on the BY variables.
then BY statement does not use an index
*/
/****************************************************/
data orders;
set sRetail.Orders;
run;

proc contents data=orders;
run;

/*
DATA SAS-data-set;
MODIFY SAS-data-set;
existing-variable = expression;
RUN;
*/
data orders;
modify orders;
city = substr(city, 1,2);  /* modify all the data in a dataset  */
totalprice = totalprice * 0.9;
run;

proc print data=orders (keep=city totalprice obs=50);
run;

/*  Modifying Observations Using a Transaction Data Set
DATA SAS-data-set;
MODIFY SAS-data-set transaction-data-set;
BY key-variable;
RUN;
The matching observation in the master data set can be replaced,
deleted, or appended. By default, the observation is replaced.
*/
data Orders(index=(orderid));   /* first sort data, then using data plus "By" statement, which is index */
set Orders;
by orderId;
run;
proc contents data=orders;
run;
/****************************************************/
proc sort data=sRetail.Orders out=orders;
by orderid;  /* before merge, need sort */
run;

data Orders;   /* first sort data, then using data plus "By" statement, which is index */
set Orders;
by orderId;
run;

proc contents data=orders;
run;
proc datasets;
modify orders;
index create orderid;
run;


proc contents data=orders;
run;
data Orders;   /* first sort data, then using data plus "By" statement, which is index */
set Orders;
by orderId notsorted ;  * do not require data been sorted;
run;

/*connect two file by filename*/
filename num1 ('&prjPath\f1.txt' '&prjPath\f2.txt');

data num;
infile num1;
input school grade num;
run;

PROC FSLIST FILE="&prjPath\sasimport\medium.txt";
RUN;


option MSGLEVEL=i; * |N  ;  /* i = information */
/* message level , to determine whether SAS is using an index*/

proc print data=orderline;
where billdate = '20Apr2010'd;
run;
proc contents data=orderline;
run;
/*FSEDIT procedure*/

/*data adv2_orders2;*/
/*set SRETAIL.ORDERS SRETAIL.ORDERLINE key=orderid; * not work  ;*/
/*run;*/


/*The SAS System may use an index when processing the following:
WHERE expression
KEY= in the SET
SCL table lookup
SQL join queries
BY processing

*/

/*Version 6, the APPEND procedure updated the index of a data*/
/*file as new observations were added, creating slow performance,*/
/*most especially for large data files.*/
/*Sorting your data before appending will reduce the index*/
/*update overhead.*/

********************Tips of using Index ***************************;
/*
If your data file is small, sequential processing is usually just
as fast or faster. If your page count (available from the
CONTENTS procedure) is less than three pages, do not use
an index; it will degrade your performance.
Consider the cost of the index if the data file is frequently
changed. An index is automatically updated when the data
file is updated, requiring additional resources.
If the subset of data for the index is not small, it may require
more resources to access the data than sequential access.
Conversely, when you intend to retrieve a small subset of
observations from a large data file, an index will most likely
be more efficient. The smaller the subset, the greater the
performance gains.
Consider your data access needs. An index must be used
often in order to make up for the resources consumed when
creating and maintaining it.
Do not use more indexes than you actually need. Find the
most discriminating variables in commonly used queries and
use them as your key variables.

For better index performance, sort your data file
into ascending order on the key variable before you index the
data file. If appending data to the indexed file, sort the data you
are appending before executing the APPEND procedure.

Sort an indexed data file, then losing the index

Percentage of Observations/ Performance Gains
1-10 excellent
11-25 good
26-50 marginal
51-100 poor
*/


/* 6. Sampling Data*/
data adv6_rand;
seed = 600;
random = uniform (seed);
put random= ;
run;

************************************* Retail Start ************************;


*merge orderline, product, orders, campaign, customer into one table;
proc sort data=sales.orderline out=orderline;
by productid;
run;
proc sort data=sales.product out=product;
by productid;
run;
data withgroup;
merge orderline product;
by productid;
run;

/*
MERGING vs. JOINING: Comparing the DATA Step with SQL
http://www2.sas.com/proceedings/sugi30/249-30.pdf
OPTIONS MERGENOBY=WARN MSLEVEL=I;
 DATA ONEs TWOs inBOTH
 NOmatch1 NOmatch2 allRECS NOmatch;
 MERGE ONE(IN=In1) TWO(IN=In2);
 BY ID;
 IF In1=1 then output ONEs;
 IF In2=1 then output TWOs;
 IF (In1=1 and In2=1) then output inBOTH;
 IF (In1=0 and In2=1) then output NOmatch1;
 IF (In1=1 and In2=0) then output NOmatch2;
 IF (In1=1 OR In2=1) then output allRECS;
 IF (In1+In2)=1 then output NOmatch;
 RUN;
*/

proc sort data=withgroup;
by orderid;
run;
proc sort data=sales.orders out=orders;
by orderid;
run;

data orderandline;
merge orders withgroup (rename=(totalprice=subtotal));
by orderid;
run;

proc sort data=orderandline;
by campaignid;
run;
proc sort data=sales.campaign out=campaign;
by campaignid;
run;

data orderandline1;
merge orderandline campaign;
by campaignid;
run;

proc sort data=orderandline1;
by customerid;
run;
proc sort data=sales.customer out=customer;
by customerid;
run;

data full;
merge customer orderandline1;
by customerid;
run;

*************** save full data for future use ***********;
data sdata.full;
set full;
run;

**************** call back full data ******************;
data full;
set sdata.full;
run;



proc sql;
select min(orderdate) format=date9., max(orderdate) format=date9.
from full;
quit;

/*6. **************************Sampling: ***********/
/*Equal probability sample: */
/*1. Simple Random Sampling With Replacement (a.k.a. Unrestricted Random*/
/*Sampling), */
/*2. Simple Random Sampling Without Replacement,*/
/*3. Bernoulli Sampling, */
/*4. Systematic Sampling,
5. and Sequential Sampling.*/
proc contents data=sales._all_;
run;

proc contents data=sales.orders;
run;
Data adv6_random(drop=rand_ seed i total_rec);
total_rec = 192983;
sample_size = 100000;
seed=total_rec ;
/* from 1 to 2,147,483,646, seeds is random, a specific seeds get a fix rand number */
do i = 1 to sample_size;
rand_ = uniform (seed);
rand_id = round(total_rec * rand_);
IF I<4 THEN DO;
PUT "RECORD = "I 'RANDOM NUMBER = ' RAND_;
END;
output;
end;
run;

/*6.1 get 10 random sample*/
data orders;
set sales.orders;
id = _N_;
run;
data adv6_rand_1;
set adv6_random(obs=100000);
run;

proc sql;
create table adv6_sample_100 as
select * from orders
where id in (select rand_id from adv6_rand_1);
* select id from adv6_sample_100;
quit;
run;
/* real time           1.04 seconds*/
/* cpu time            1.03 seconds*/


/* random sample using point= statement */
data adv_samp2;
rand_id = 6;
popu_size = 10;
set orders point=rand_id nobs=popu_size;  /* use point= statement to select sub-set data*/
output;
stop;
run;

data adv_samp2;
rand_id = 6;
popu_size = 100;
set orders point=rand_id nobs=popu_size;  /* use point= statement to select sub-set data*/
output;
stop;
run;



data adv_samp2;
rand_id = 6;
set orders point=rand_id;  /* use point= statement to select sub-set data*/
output;
stop;
run;

data adv_samp3;
seed = 900;
popu_size = 100;
do i=1 to 30000;
rand_id = ceil(popu_size*ranuni(seed));
set orders point=rand_id nobs=popu_size;  /* use point= statement to select sub-set data*/
drop seed i popu_size rand_id;
output;
end;
stop;
run;
/* real time           0.06 seconds*/
data adv_samp4;
seed = 900;
popu_size = 100000;
do i=1 to 30000;
rand_id = ceil(popu_size*ranuni(seed));
set orders point=rand_id nobs=popu_size;  /* use point= statement to select sub-set data*/
drop seed i popu_size rand_id;
output;
end;
stop;
run;
/* real time           1.28 seconds*/
/* cpu time            1.24 seconds*/

%let sample_size=5000;
%let popu_size = 192983;
/*Bernoulli Sampling: give random sample size */
DATA adv6_samp5 ;

SET orders NOBS=popu_size;
SamplingProbability=&SAMPLE_SIZE/popu_size;
SELECTED=0;
SamplingWeight=0;
SEED=12345;
IF UNIFORM(SEED) =< SamplingProbability THEN
DO;
SamplingWeight=1/SamplingProbability; *sample weight  = Popu_size/sample_size;
SELECTED=1;
OUTPUT; * because it is already selected, then Bernoulli=1;
END;
/*OUTPUT FRAME_FINAL;*/
RUN;


%let sample_size=50;
%let popu_size = 13;
/*NOBS*/
DATA adv6_samp5 ;
SET orders NOBS=popu_size;
if _N_ = 192983 then put popu_size=;
RUN;
/*The NOBS = option creates a variable which contains the total number
of observations in the input data set(s).
If multiple data sets are listed in the SET statement,
the value in the NOBS = variable are the total number of observations
in all the listed data sets.*/
/*SIMPLE RANDOM SAMPLING WITH REPLACEMENT*/

DATA adv6_samp5 ;
SET orders NOBS=popu_size;
call symput('pop_size', popu_size);
put popu_size=;
stop;
RUN;
%put &pop_size=;


DATA adv6_samp5 ;
SET orders(OBS=100);
RUN;


DATA adv6_samp5 ;
SET orders(firstobs=20 OBS=100);
RUN;

%let sample_size=5000;
%let popu_size = 192983;
DATA adv6_samp_withReplace ;
SEED=123466;
DO samp_cnt=1 TO &sample_size;
SELECT=CEIL( UNIFORM(SEED) * popu_size );
SET orders POINT=SELECT NOBS=popu_size;  /* random select */
SAMPLINGPROBABILITY=&SAMPLE_SIZE/popu_size;
SamplingWeight=1/SamplingProbability;
output;
end;
stop;
run;

/*detect duplicate sample */
proc sort data=adv6_samp_withReplace out=adv6_samp6 nodup;  * nodup=detect duplicate observation;
by customerid;
run;

proc sort data=adv6_samp_withReplace out=adv6_samp6 nodupkey;
by customerid;
run;
proc sort data =adv6_samp_withReplace ;
by customerid;
run;




proc freq data =adv6_samp_withReplace ;
tables customerid*campaignid;
run;


/*remove duplicate records example */

proc sort data=orderline;
by billdate;
run;
/*then we want to choose only one record in each transaction day and remove the others*/
DATA adv6_samp8;
SET orderline;
RETAIN Hits;
by billdate;
if first.billdate=1 then Hits=0;
Hits+1;
if last.billdate=1 then
do;
NumberHits=HITS;
output; * only output the last record ;
end;
run;
/*we want to know the first billdate and last billdate*/

/* ******* SIMPLE RANDOM SAMPLING WITHOUT REPLACEMENT ***
  each sample have only one time chance been selected
*/
%let sample_size=5000;
DATA adv6_samp10; * ????????????????????? sample without replacement ;
SEED=123;
COUNT=0;
SamplingProbability=&SAMPLE_SIZE/popu_size;
SamplingWeight = popu_size/&Sample_Size;
ARRAY SEL_OBS{&SAMPLE_SIZE} _temporary_ ;
/*Temporary data element values are always automatically retained, rather than
being reset to missing at the beginning of the next iteration of the DATA step.*/
DO K=1 TO &SAMPLE_SIZE;
ReDo: SELECT=CEIL(UNIFORM(SEED)*Popu_size);
DO SAMPL_CNT=1 TO COUNT;
IF SEL_OBS(SAMPL_CNT)=SELECT then GOTO ReDo;
END;0
SET orderline POINT=SELECT NOBS=popu_size;
COUNT=COUNT+1;
SEL_OBS(COUNT)=SELECT;
OUTPUT;
END;
STOP;
RUN;

/*use proc sort to do simple sample without replacement(huifang)*
Method 1:
step 1. bind a random number to existed dataset
 step 2. sort the dataset by random number
 step 3. sub-set the first sample-sized record
*/
%let seed = 12345;
%let pop_size= 192983;
%let Sample_Size = 50;
DATA adv6_samp11;
SET orderline;
rand_= ranuni(&seed) * &pop_size;
run;

* step 2 ;
proc sort data=adv6_samp11 out=adv6_samp12;
by rand_;run;

* step 3 ;
data adv6_samp13;
set adv6_samp12(obs=&Sample_Size); *NOBS=&pop_size;
* SamplingWeight= &Sample_Size/(&pop_size);
run;

/*use proc sort to do simple sample without replacement(huifang)*
Method 2: Using Base SAS and PROC SURVEYSELECT.pdf, by David.D
*/

/* use ranks to get random dataset */
/*DATA step views are stored, compiled DATA step programs.
data view stores only the instructions for where to find the data
and how it is formatted, not the actual data ; Views can ensure that the input data sets
are always current because data is derived from views at execution time
*/
data adv6_12b/view=adv6_12b;   /* DATA STEP view saved on file WORK.ADV6_12B */
* create a view, instead of the real dataset, save workspace, save computation resource;
* about view: http://www2.sas.com/proceedings/sugi29/067-29.pdf;
set orderline;
seed=12345;
random=uniform(seed); * add a random var to existed dataset;
run;


data adv6_12b adv6_12c /view=adv6_12b;   /* DATA STEP view saved on file WORK.ADV6_12B */
/*the view name must equal to dataset name, because view is saved on existed data set file*/
* create a view, instead of the real dataset, save workspace, save computation resource;
* about view: http://www2.sas.com/proceedings/sugi29/067-29.pdf;
set orderline;
if _N_ = 1 then output adv6_12c;
else
do;
seed=12345;
random=uniform(seed); * add a random var to existed dataset;
output adv6_12b;
end;
run;

data adv6_12d /view=adv6_12d;   /* DATA STEP view saved on file WORK.ADV6_12B */
/*to the new created dataset 'adv6_12d', first create dataset, then create view*/
set orderline;
run;


data adv6_12e /view=adv6_12f;   /* DATA STEP view saved on file WORK.ADV6_12B */
/*error: The requested type of view (Input or Output) cannot be determined*/
set orderline;
run;



data WORK.TEMP WORK.ERRORS /view=WORK.TEMP;   /* DATA STEP view saved on file WORK.ADV6_12B */
set orderline;
if _N_ = 1 then output WORK.ERRORS;
else
output WORK.TEMP;
run;


/*
DATA STEP view saved on file WORK.ADV6_12B.
NOTE: A stored DATA STEP view cannot run under a different operating system
*/

data adv6_12e;
input a;
datalines;
2
4
;
run;
data adv6_12d;
input a;
datalines;
2
4
;
run;


data adv6_12f/view=adv6_12f;
set adv6_12e;  /*create view based on existed dataset*/
run;
/* ties:
When two records have identically the same value for a variable, PROC RANK
considers this a tie. Tied records are assigned the same value for
a rank. When dealing with random number, it is unlikely there will
be a tie; however, to be safe you should specify 揟IES=HIGH.?This
will assign the highest rank to all tied values and the number of
records will be greater than the sample size. */

/* Fig 1  : Proc Rank Example

Obs spend

 1 8
 2 4
 3 1
 4 7
 5 3
 6 5
 7 2
 8 6

Fig 2

Obs spend r_spend

 1 122345 8
 2 235 4
 3 12 1
 4 5677 7
 5 214 3
 6 432 5
 7 121 2
 8 1567 6  */

data adv6_rank1;
input spend @@;
datalines;
122345 235 12 5677 214 432 121 1567
;
run;
proc rank data=adv6_rank1 out=adv6_rank2;
var spend;
run;

proc rank data=adv6_rank1 out=adv6_rank2;
var spend;
ranks r_spend;
run;

proc print data=adv6_rank2;
run;

%let sample_size=5000;
%let popu_size = 192983;
data adv6_12;
set orderline;
seed=12345;
random=uniform(seed); * add a random var to existed dataset;
run;

/* Sampling: get random sample by proc rank*/
proc rank data=adv6_12 TIES=HIGH out=adv6_samp14(where=(rank_by_rand <= &sample_size));
var random;
ranks rank_by_rand;
* this equals to sort dataset by ascending order, then get the first N observation;
run;

proc sort data=adv6_samp14 out=adv6_samp15;
by rank_by_rand;
run;

/*grouping by proc rank:
Get deciles, quartiles, percentiles
Deciles --- GROUPS=10,
quartiles --- GROUPS=4,
percentiles --- GROUPS=100
*/
proc rank data=adv6_12 TIES=HIGH out=adv6_samp16 groups=4;
var random;
ranks rank_by_rand;
run;

proc sort data=adv6_samp16 out=adv6_samp16;
by rank_by_rand;
run;

/* get random data */
DATA adv6_samp17;
SET adv6_samp14(obs=&sample_size);
SamplingWeight=&popu_size/&sample_size;
run;

/* PROC SURVEYSELECT : deal with sampling method
Equal Probability              Unequal Probability
Unrestricted Random
Sampling
PPS Sampling
With Replacement
Simple Random
Sample Without
Replacement
PPS Sampling
Without Replacement
Systematic Sampling PPS Systematic Sampling
Sequential Sampling PPS Sequential Sampling
Brewer's PPS Method

PROC SURVEYSELECT options;
STRATA variable(s) --input file had to be sorted by strata in ascending order
CONTROL variable(s)--specify additional variables (other than strata) to sort by
when performing, example :CONTROL AGE_COHORT GENDER RACE;

SIZE variable ;
ID variable-- specify variables from the input file or sampling frame to carry
into the output file.
RUN;
*/
/*unique sample: Unrestricted Random Sampling */

/*
SRS --
Strata --sampsize(345|smpsize var|smpsize dataset)
pps--- size(size var)
*/

/*Strata*/
/*1.get dataset, subset*/
data channels;
set sales.timespan_g(keep=channel paymenttype cost Npurchase);
run;

proc contents data=channels;
run;

/*2. strata --> built sampsize_var*/
proc freq data=channels; *take a tour to data;
tables channel paymenttype;
run;
/*method 1: each channel have some sample, e.g. if freq(>300, 20, 10),
then sampling 50 else sampling 3  strata_var=regroup*/
data chn_m1(drop=gt10k gt1k lt1k);
set channels;
gt10k = 1; gt1k=2; lt1k=3;
select(trim(channel));
when ('AD') chgrp=gt10k;
when ('BULK') chgrp=lt1k;
when ('CATALOG') chgrp=lt1k;
when ('CONFERENCE') chgrp=lt1k;
when ('EMAIL') chgrp=lt1k;
when ('EMPLOYEE') chgrp=lt1k;
when ('INSERT') chgrp=gt1k;
when ('INTERNAL') chgrp=lt1k;
when ('MAIL') chgrp=gt1k;
when ('PARTNER') chgrp=gt10k;
when ('REFERRAL') chgrp=gt10k;
when ('SURVEY') chgrp=gt10k;
when ('WEB') chgrp=lt1k;
otherwise  chgrp=gt10k;
end;
select(chgrp);
when(gt10k) _NSIZE_=300;
when(gt1k) _NSIZE_=5;
when(lt1k) _NSIZE_=30;
end;
run;
proc freq data=chn_m1;
tables chgrp _NSIZE_;
run;

data smps_size; /* build stata size var: column1=strata_var, column2=size_var */
set chn_m1(keep=chgrp _NSIZE_);
run;
proc sort data=smps_size;
by chgrp;  * smp_size need sorted before using surveyselect;
run;

proc freq data=chn_m1;
tables chgrp _NSIZE_;
run;
/*method 2: each channel equal: 1, stata_var=channel*/
/*method 3: group data by 3 group: freq > 10000, >1000, < 1000, then sampsize_var=50,20,20, strata_var=group*/

/*3. sort by strata var*/
proc sort data=chn_m1 out=chn_m2;
by chgrp channel;
run;


/*4. surveyselect (sampsize=sampsize_var  strata=stata_var)*/
proc surveyselect data=chn_m2 sampsize=smps_size  out=chn_m3;
strata chgrp ;
run;
/*statement: n=  equals to sampsize= */
proc surveyselect data=chn_m2 n=smps_size  out=chn_m3;
strata chgrp ;
run;


proc print data=chn_m3;
by chgrp;
run;

proc print data=chn_m3;
by chgrp channel;
run;

/*method 1.1 use a constant as sampsize*/
/*step 1: buid dataset,*/
data chn_m5(drop=gt10k gt1k lt1k);
set channels;
gt10k = 1; gt1k=2; lt1k=3;
select(trim(channel));
when ('AD') chgrp=gt10k;
when ('BULK') chgrp=lt1k;
when ('CATALOG') chgrp=lt1k;
when ('CONFERENCE') chgrp=lt1k;
when ('EMAIL') chgrp=lt1k;
when ('EMPLOYEE') chgrp=lt1k;
when ('INSERT') chgrp=gt1k;
when ('INTERNAL') chgrp=lt1k;
when ('MAIL') chgrp=gt1k;
when ('PARTNER') chgrp=gt10k;
when ('REFERRAL') chgrp=gt10k;
when ('SURVEY') chgrp=gt10k;
when ('WEB') chgrp=lt1k;
otherwise  chgrp=gt10k;
end;
run;

/*step 2: sort by strata var*/
proc sort data=chn_m5;
by chgrp;
run;

/*step 3: use surveyselect*/
proc surveyselect data=chn_m5 sampsize=100 out=chn_m6;
strata chgrp;
run;

/*n=100 equals sampsize=100*/
proc surveyselect data=chn_m5 n=100 out=chn_m6;
strata chgrp;
run;

/*using more than one strata variables*/
/*1. build strata vars, built sampsize dataset*/
/*   chgrp, paygrp using derived group, so if need  two, then combine as one or something else*/
data chn_m5(drop=gt10k gt1k lt1k chgrp paytp);
set channels;
gt10k = 'g10k'; gt1k='g1k'; lt1k='l1k';
select(trim(channel));
when ('AD') chgrp=gt10k;
when ('BULK') chgrp=lt1k;
when ('CATALOG') chgrp=lt1k;
when ('CONFERENCE') chgrp=lt1k;
when ('EMAIL') chgrp=lt1k;
when ('EMPLOYEE') chgrp=lt1k;
when ('INSERT') chgrp=gt1k;
when ('INTERNAL') chgrp=lt1k;
when ('MAIL') chgrp=gt1k;
when ('PARTNER') chgrp=gt10k;
when ('REFERRAL') chgrp=gt10k;
when ('SURVEY') chgrp=gt10k;
when ('WEB') chgrp=lt1k;
otherwise  chgrp=gt10k;
end;
if (trim(paymenttype)) in('??') then  paytp = 'Cash  ';
if (trim(paymenttype)) in('AE', 'DB','OC') then  paytp = 'Other';
if (trim(paymenttype)) in('MC', 'VI') then  paytp = 'Credit';
grp_paytp = chgrp || '-' || paytp;    ***  build a new variable by two variable;
run;

/*2. sort by strata vars*/
proc sort data=chn_m5 out=chn_m6;
by grp_paytp;
run;

proc freq data=chn_m6;
tables grp_paytp;
run;

data sampsize_grp(drop=smpsize);
input grp_paytp $12. smpsize _NSIZE_ ;
datalines;
g10k-Cash 248 20
g10k-Credit 63742 1000
g10k-Other 39302 1000
g1k -Cash 3 3
g1k -Credit 5414 50
g1k -Other 2764 50
l1k -Cash 18 5
l1k -Credit 31604 1000
l1k -Other 13163 1000
;
run;
proc sort data=sampsize_grp;
by grp_paytp;
run;
proc print data=sampsize_grp;
run;


/*3. use simple sampsize option, each strata has seem samples*/
proc surveyselect data=chn_m6 out=chn_m7 sampsize=sampsize_grp;
strata grp_paytp;
run;

proc surveyselect data=chn_m6 out=chn_m8 sampsize=sampsize_grp method=srs;
strata grp_paytp;
run;

proc surveyselect data=chn_m6 out=chn_m8 sampsize=sampsize_grp method=sys;
strata grp_paytp;
run;


proc surveyselect data=chn_m6 out=chn_m8 method=sys rate=.2;
/*  SYS sample: systematic random sampling.
rate= statement each strata has 2% rate of samples*/
strata grp_paytp;
run;

/*unrestric sample: with replacement, some duplicate records*/
proc surveyselect data=chn_m6 out=chn_m8 method=URS rate=.2;
/* Unrestricted Random Sampling*/
/*rate= statement each strata has 2% rate of samples*/
/* WORK.CHN_M8 has 28195 observations*/
strata grp_paytp;
run;


proc surveyselect data=chn_m6 out=chn_m8 method=URS rate=.02;
/*rate= statement each strata has 2% rate of samples*/
/* WORK.CHN_M8 has 3104 observations*/
strata grp_paytp;
run;
proc surveyselect data=chn_m6 out=chn_m8 method=URS n=3;
/* WORK.CHN_M8 has 27 observations*/
strata grp_paytp;
run;

proc surveyselect data=chn_m6 out=chn_m8 method=URS sampsize=3;
/* WORK.CHN_M8 has 27 observations*/
strata grp_paytp;
run;

proc surveyselect data=chn_m6 out=chn_m8 method=URS sampsize=3 rep=1;
/* WORK.CHN_M8 has 27 observations*/
strata grp_paytp;
run;

/*PPS*/
/*1. subset data*/
/*2. build size_var of each group*/

/*3. surveyselect (sampsize= n= size=size_var)*/

proc contents DATA=orderline ;
run;
proc sort data=orderline;
by productid;
run;
/*simple random sample*/
PROC SURVEYSELECT  DATA=orderline seed=12345
METHOD=SRS OUT=adv6_19 sampsize=500;
* strata  productid;  /* */
RUN;

PROC SURVEYSELECT  DATA=orderline seed=12345
METHOD=SRS OUT=adv6_19 sampsize=5;
RUN;
proc print data=adv6_19;
run;

/*Systematic Sampling: taking items at fixed intervals
(such as every fifth item from a list with random starting point)*/
/*In equal probability sampling, each unit in the sampling frame,
or in a stratum, has the same probability of being selected for the sample.
In PPS sampling, a unit's selection probability is proportional to its size measure.*/

PROC SURVEYSELECT  DATA=orderline seed=12345 METHOD=URS OUT=adv6_19 SAMPSIZE=5000;
strata  productid;  /* */
RUN;
/* duplicate sample */
PROC SURVEYSELECT  DATA=orderline seed=12345 METHOD=URS OUT=adv6_19 OUTHITS
                   SAMPSIZE=5000;
strata  productid;
RUN;


data adv6_22;
do i =1 to 1000;
output;
end;
run;
/*using PROC SURVEYSELECT-Simple Random Sampling, without replacement */;
proc surveyselect data=adv6_22 method=srs n=100 out=adv6_22b;
run;
proc surveyselect data=adv6_22 method=srs n=100 out=adv6_22b;
run;
proc surveyselect data=adv6_22 method=srs seed=12345 n=100 out=adv6_22b;
run;

/*using PROC SURVEYSELECT-Stratified Sampling*/;

proc freq data=orderline;
tables productid;
run;
proc contents data=orderline;
run;
proc format ;
value prdType
14019-14030='TypeA'
14031-14040='TypeB'
other = 'TypeC'
;
run;

libname library 'c:\sas\newfmts';
proc format lib=library;
value $routes
'Route1' = 'Zone 1'
'Route2' - 'Route4' = 'Zone 2'
'Route5' - 'Route7' = 'Zone 3'
' ' = 'Missing'
other = 'Unknown';
value $dest
'AKL','AMS','ARN','ATH','BKK','BRU',
'CBR','CCU','CDG','CPH','CPT','DEL',
'DXB','FBU','FCO','FRA','GLA','GVA',
'HEL','HKG','HND','JED','JNB','JRS',
'LHR','LIS','MAD','NBO','PEK','PRG',
'SIN','SYD','VIE','WLG' = 'International'
'ANC','BHM','BNA','BOS','DFW','HNL',
'IAD','IND','JFK','LAX','MCI','MIA',
'MSY','ORD','PWM','RDU','SEA','SFO' = 'Domestic';
value revfmt
. = 'Missing'
low - 10000 = 'Up to $10,000'
10000 <- 20000 = '$10,000+ to $20,000'
20000 <- 30000 = '$20,000+ to $30,000'
30000 <- 40000 = '$30,000+ to $40,000'
40000 <- 50000 = '$40,000+ to $50,000'
50000 <- 60000 = '$50,000+ to $60,000'
60000 <- HIGH = 'More than $60,000';
run;
/*deal with overlap range */
proc format;
value dates (multilabel)
'01jan2000'd - '31mar2000'd = '1st Quarter'
'01apr2000'd - '30jun2000'd = '2nd Quarter'
'01jul2000'd - '30sep2000'd = '3rd Quarter'
'01oct2000'd - '31dec2000'd = '4th Quarter'
'01jan2000'd - '30jun2000'd = 'First Half of Year'
'01jul2000'd - '31dec2000'd = 'Second Half of Year';
run;

/*the MLF option: tell SAS using multi-label, which have overlap range */
proc tabulate data = sasuser.sale2000 format = dollar15.2;
format Date dates.;
class Date / mlf;
var RevCargo;
table Date, RevCargo*(mean median);
run;

/*
Digit selectors are numeric characters (0 through 9) that define positions for numeric
values. If you use nonzero digit selectors, zeros are added to the formatted value as
needed. If you use zeros as digit selectors, no zeros are added to the formatted value.
*/
proc format;
picture rainamt
0-2='9.99 slight'
2<-4='3.33 moderate'
4<-<10='9.99 heavy'
other='999 check value';
run;
data rain;
input Amount;
datalines;
4
3.9
20
.5
6
;
run;
proc print data=rain;
format amount rainamt.;
run;


/*directive format*/
/*
%a abbreviated weekday name
%A full weekday name
%b abbreviated month name
%B full month name
%d day of the month as a number 1-31, with no leading zero
%H 24-hour clock as a number 0-23, with no leading zero
%I 12-hour clock as a number 1-12, with no leading zero
%j day of the year as a number 1-366, with no leading zero
%m month as a number 1-12, with no leading zero
%M minute as a decimal number 0-59, with no leading zero
%p AM or PM
%S second as a number 0-59, with no leading zero
%U week number of the year (Sunday is the first day of the week) as a number 0-53,
with no leading zero
%w weekday as a number (1=Sunday, to 7)
%y year without century as a number 0-99, with no leading zero
%Y year with century as a number
if need 0, add 0 in the directive so that if a one-digit numeric value is returned,
it is preceded by a 0
syntax:
proc format;
PICTURE format-name
value-or-range='directivies' (DATATYPE=SAS-date-value-type);

*/

proc format;
picture fmtDate
low-high='%0d-%b%Y' (datatype=date);
/*???????????????????? Need to build a design format layout page, then for later usage*/
run;

proc print data=orderline(keep=billdate obs=5);
format billdate fmtdate.;
run;


PROC CATALOG CATALOG=libref.catalog;
CONTENTS <OUT=SAS-data-set>;
COPY OUT=libref.catalog <options>;
SELECT entry-name.entry-type(s);
EXCLUDE entry-name.entry-type(s);
DELETE entry-name.entry-type(s);
RUN;
QUIT;

/*
When a format is referenced, SAS automatically looks through the following libraries
in this order:
?? Work.Formats
?? Library.Formats.

Option: FmtErr, NoFmtErr --> set to format defination as missing if error in format,
then data keep as the original format

Syntax: OPTIONS FMTERR | NOFMTERR;
By default, the FMTERR system option is in effect. If you use a format that SAS
cannot load, SAS issues an error message and stops processing the step. To prevent
this, you must change the system option FMTERR to NOFMTERR. When NOFMTERR
is in effect, SAS substitutes a format for the missing format and continues processing.

*/
data adv6_22d;
set orderline;
prdTypeGrp =put(productid, prdType.);
run;

proc sort data=adv6_22d;
by prdTypeGrp;
run;
proc freq data=adv6_22d;
tables prdTypeGrp;
run;
proc surveyselect data=adv6_22d method=srs n=43 seed=1953 out=adv6_22c;
strata prdTypeGrp;
id productid;
run;
proc surveyselect data=adv6_22d method=sys n=44 seed=1953 out=adv6_22c;
strata prdTypeGrp;
id productid;
control productid;
run;

Data frame(DROP=I SEED );
RECORDS=1;
DO i=1 to 50000;
id_num=put(i,$6.);
seed=123;
random=uniform(seed);
N1=normal(1123)*500 + 5000;
N2=normal(2123)*500 + 5000;
N3=normal(3123)*500 + 5000;
N4=normal(4123)*500 + 5000;
output;
end;
run;
data adv6_25;
do i=1 to 100;
n1 = normal(100);
put n1=;
end;
run;
proc means data=adv6_25;
var n1;
run;



/*PROC TRANSPOSE DATA= PREFIX= OUT= NAME= LABEL= ;*/
/*VAR variable-list; ---> the count var, the data to show in content */
/*ID variable; */
/*BY variable-list; */
/*RUN;*/

/*proc transpose data=transactions_small */
/*out=month_status(drop=_NAME_) prefix=month;*/
/*by customer_id;*/
/*id month;*/
/*var one;run;*/

/*
The original SAS data set must be sorted or indexed prior to using a BY statement
with PROC TRANSPOSE unless you use the NOTSORTED option.
*/

proc contents data=sales._all_;
run;
/*design a table shows each month each paytype, from business, each month paytpye*/
/*and payamount*/
data timespans(keep=paymenttype channel timespan);
set sales.timespan_g;
run;



/*method 1: bin --> proc freq, re-bin data*/
proc format ;
value spanRange
low-10 = 'Short'
11-60 = 'Mid'
61-high = 'Long'
;
run;
data adv7_1(drop=timespan);
set timespans;
spanName = put(timespan, spanRange.);
run;

proc print data=adv7_1(obs=200);
run;

proc freq data=adv7_1;
tables paymenttype*spanname;
run;

proc freq data=adv7_1 noprint;
tables paymenttype*spanname/norow nocol nopercent out=adv7_2(drop=percent);
run;
proc freq data=adv7_1;
tables paymenttype*spanname/norow nocol nopercent out=adv7_2;
run;

proc contents data=adv7_2;
run;

proc sort data=adv7_2;
by spanName;
run;
proc transpose data=adv7_2 out=adv7_3 prefix=payType;
by spanName;  /* must sort before use*/
id paymenttype;
var count;
run;

proc transpose data=adv7_2 out=adv7_3(drop=_Label_) prefix=payType name=payTypeNm_;
by spanName;  /* must sort before use*/
id paymenttype;
var count;
run;

proc transpose data=adv7_2 out=adv7_3(drop=_Label_) name=payTypeNm_;
by spanName;  /* must sort before use*/
var count;
run;
proc contents data=adv7_2;
run;
 /*               pay
                   span     Type
            Obs    Name      Nm_     COL1     COL2    COL3     COL4    COL5     COL6

             1     Long     COUNT      28     5243    1169     4114     436     5039
             2     Mid      COUNT      38     1976    1491     1298     168     2005
             3     Short    COUNT     203    32610    5669    33696    6467    54608
*/

proc transpose data=adv7_2 out=adv7_3(drop=_Label_) name=payTypeNm_;
by spanName;  /* must sort before use*/
var percent;
run;
/*           span       pay
    Obs    Name     TypeNm_      COL1       COL2       COL3       COL4       COL5       COL6

     1     Long     PERCENT    0.01792     3.3553    0.74812     2.6328    0.27903     3.2248
     2     Mid      PERCENT    0.02432     1.2646    0.95419     0.8307    0.10751     1.2831
     3     Short    PERCENT    0.12991    20.8693    3.62797    21.5643    4.13867    34.9473
*/

proc print data=adv7_3;
run;
proc transpose data=adv7_2 out=adv7_3(drop=_Label_) prefix=payType name=payType_;
by spanName;  /* must sort before use*/
id paymenttype;
idlabel spanName;
var count;
run;

proc print data=adv7_3;
run;

*-------------------------------------------------------;

proc contents data=orderline;
run;
proc sort data=orderline out=adv6_18a;
by productid;
run;
proc surveyselect data=adv6_18a out=adv6_18b
method=PPS_SYS sampsize=&sample_size;
strata  productid; /* input file had to be sorted by strata in ascending order */
control shipdate ; * specify additional variables (other than strata) to sort by when performing sampling.;
id totalprice unitprice billdate numunits orderid orderlineid shipdate;
size orderlineid;
run;

proc contents data=adv6_18b;
run;
proc sort data= adv6_18b;
by productid;
run;

PROC SURVEYSELECT DATA = WORK.US
OUT = WORK.SAMPLE
METHOD = PPS_SYS /* URS, SRS, SYS, SEQ */
SAMPSIZE = 1000;
STRATA productid;
CONTROL POPULATION ;
SIZE HOUSHOLDS ;
RUN;



/**************************** Advance SAS Test**************************/

/*04OCT2009  20SEP2016*/
*******full is the entire sales data on orderline level ********;
proc contents data=full;
run;
proc sort data=full
out=countoir(keep=householdid subtotal  orderdate paymenttype channel);
by householdid;
where householdid ne .;
run;

proc contents data=countoir;
run;

*calculate the total number of purchases for each household;
data count (keep=householdid npurchase);
set countoir;
by householdid;
if first.householdid then Npurchase=0;
Npurchase+1;
if last.householdid then output;
run;



proc sql;
create table  count2 as
select householdid, count(householdid) as nPurchase
from countoir
group by householdid
order by householdid;
quit;

*calculate the total cost for each household;
data cost (keep=householdid  cost);
set countoir;
by householdid ;
if first.householdid then cost=0;
cost+subtotal;
if last.householdid then output;
run;

proc sql ;
select max(cost) as maxCost, min(cost) as minCost
from cost;
quit;
proc contents data=countoir;
run;
data cost2 (drop=customerid orderdate paymenttype channel);
set countoir;
by householdid ;
run;

proc sort data=cost2 out=cost3;
by householdid;
run;

proc sql;
create table cost_hshd as
select householdid, sum(subtotal) as houseTotal
from cost3
group by householdID
order by calculated houseTotal;
select max(houseTotal) as max, min(houseTotal) as min from cost_hshd;
quit;



*calculate the purchase time span for each household;
proc sort data=countoir;
by householdid orderdate;
run;
data timespan  (keep=householdid  timespan);
set countoir;
by householdid  orderdate;
retain temp;
if first.householdid then temp=orderdate;
if last.householdid then do;
timespan=intck('day', temp, orderdate);
output;
end;
drop temp;
run;

proc sort data=timespan nodupkey out=sortTimespan;
by householdid;
run;


*2.d calculate the mode of paymenttype for each household;
proc freq data=countoir noprint;
table householdid*paymenttype/out=paymentype;
run;
proc sort data=paymentype;
by householdid  descending count;
run;

data paytype;
set paymentype;
by householdid  descending count;
if first.householdid then output;
drop count percent;
run;


*2.e calculate the mode of campaign channel for each  household;
proc freq data=countoir noprint;
table householdid*channel/out=channel;
run;
proc sort data=channel;
by householdid  descending count;
run;

data channel;
set channel;
by householdid  descending count;
if first.householdid  then output;
drop count percent;
run;


*merged is the analysis table which contains total number of purchase, ;
*total cost, total purchase time span, most frequent paymenttype, most frequent
campaign channel ;
*for each  household;
data merged;
merge count cost timespan paytype channel;
by householdid ;
run;

/*******************save data for future use************/
data sdata.merged;
set merged;
run;

/*call back data*/
data merged;
set sdata.merged;
run;
*******************************************************;
proc freq data=merged;
table npurchase  paymenttype channel/missing;
run;

/*
Segment the households
*/
/*3. segment the hoseholds into 5 groups based on total cost*/
data merged;
set merged;
cost1=cost+ranuni(555)/1000000;
run;

proc rank data=merged group=5 out=merged (drop=cost1);
var cost1 ;
ranks cost_g ;
run;
/*4. segment households into 2 groups based on purchase date period*/
proc rank data=merged group=2 out=merged;
var timespan ;
ranks timespan_g ;
run;
/* 5. segment households into 2 groups based on totol number of purchase*/
proc rank data=merged group=2 out=merged;
var npurchase;
ranks npurchase_g;
run;

proc freq data=merged ;
table cost_g timespan_g npurchase_g ;
run;
*7. Produce the correlation matrix for total number of purchase, total cost, total purchase time span ;
proc corr data=merged outp=pvalue;
var npurchase cost timespan;
run;


proc corr data=merged;
var npurchase cost timespan;
run;

data pvalue;
set pvalue;
where _type_='CORR';
drop _type_;
run;

proc contents data=pvalue;
run;

data pvalue_1;
set pvalue (rename=(_name_=Var1 npurchase=corr));
Var2='Npurchase';
keep  var1 var2 corr;
run;

proc contents data=pvalue_1;
run;

data pvalue_2;
set pvalue (rename=(_name_=Var1 cost=corr));
Var2='cost';
keep  var1 var2 corr;
run;
data pvalue_3;
set pvalue (rename=(_name_=Var1 timespan=corr));
Var2='timespan';
keep  var1 var2 corr;
run;
/*??????????????? what he what to do: ask classmates: ???????????????*/
data pvalue_r;
set pvalue_1 pvalue_2 pvalue_3;
if var1<=var2 then delete;
run;


/* 7. Compute the correlation coefficient between the following pair of variables */
*7.a. Cost_g vs. timespan;
proc corr data=merged spearman outs=svalue1;
var cost_g;
with timespan;
run;
proc corr data=merged spearman;
var cost_g;
with timespan;
run;
/*get data from the outputed result table*/
data svalue1_r;
set svalue1 (rename=(cost_g=corr));
where _type_='CORR';
Var1='cost_g';
Var2='timespan';
drop _type_ _name_;
run;


*b. Timespan_g vs. Npurchase;
proc corr data=merged spearman outs=svalue2;
var timespan_g;
with npurchase;
run;
data svalue2_r;
set svalue2 (rename=(timespan_g=corr));
where _type_='CORR';
Var1='timespan_g';
Var2='npurchase';
drop _type_ _name_;
run;

*c. Channel vs. cost_g;
proc freq data=merged;
table channel*cost_g/chisq;
output out=cvalue1 cramv;
run;
data cvalue1;
set cvalue1(rename=(_cramv_=corr));
Var1='channel';
Var2='cost_g';
run;

*d. Paymenttype vs.cost_g;
proc freq data=merged;
table paymenttype*cost_g/chisq;
output out=cvalue2 cramv;
run;
data cvalue2;
set cvalue2(rename=(_cramv_=corr));
Var1='paymenttype';
Var2='cost_g';
run;


*e. Paymenttype vs. timespan_g;
proc freq data=merged;
table paymenttype*timespan_g/chisq;
output out=cvalue3 cramv;
run;
data cvalue3;
set cvalue3(rename=(_cramv_=corr));
Var1='paymenttype';
Var2='timespan_g';
run;

/*????????????????? what does it mean by partial out ????????*/
*f. Cost vs. Npurchase with the effect of timespan removed;
proc corr data=merged ;
var cost;
with Npurchase;
partial timespan;
run;


proc corr data=merged;
var cost;
with Npurchase;
run;

/*8. Output all correlation coefficients calculated in 7 and 8 (except f.),
tag them as 'Very strong', 'Strong',
'Moderate', 'Week', 'Very week' according to the scales indicated in the lecture notes.*/
proc format;
value degree 0.8-1='Very Strong'
            0.6-<0.8='Strong'
0.4-<0.6='Moderate'
0.2-<0.4='Weak'
0-<0.2='Very Week'
.='Missing'
other='Error'
;
run;
data all;
set pvalue_r svalue1_r svalue2_r cvalue1 cvalue2 cvalue3;
degree=put(corr, degree.);
run;

************************save data for future use**;
data sdata.all;
set all;
run;
*************** call back data *******************;
data all;
set sdata.all;
run;
****************************************************;

proc freq data=all;
table degree;
run;


proc sql;
create table order_channel as
select o.*, c.channel, year(orderdate) as year
from orders o inner join campaign c on o.campaignid=c.campaignid;
quit;


* Are the average sale amounts different between states 'New York' and 'New Jersey'?;
proc sort data=order_channel;
by state;
run;
/*Test if NY, NJ are the normal distribution, and variance equal */
proc univariate data=order_channel (where=(state in ('NY' 'NJ'))) normal;
var totalprice;
by state;
run;

PROC UNIVARIATE data=order_channel plot;
var totalprice;
RUN;

/*PROC UNIVARIATE < options > ;*/
/*CLASS variable(s) < / KEYLEVEL= value >;*/
/*VAR variable(s) ;*/
/*BY variable(s) ;*/
/*HISTOGRAM < variables > < / options > ;*/
/*FREQ variable ;*/
/*ID variable(s) ;*/
/*INSET keyword-list < / options > ;*/
/*OUTPUT < OUT=SAS-data-set > . . . < percentile-options >;*/
/*PROBPLOT < variable(s) > < / options > ;*/
/*QQPLOT < variable(s) > < / options > ;*/
/*WEIGHT variable ;*/
/*RUN;*/


/*T-Test???:*/
/*H0:Mean1=Mean2*/
/*T-Test????:*/
/*1. A,B independant*/
/*2. A,B are normal distribution*/
/*3. Variance of A,B are equal*/
proc ttest data=order_channel (where=(state in ('NY' 'NJ')));
class state;
var totalprice;
run;
/*Method Variances DF t Value Pr > |t| --> then H0: Mean1=Mean2 not support*/
/*Equal  0.0911 */
/*Unequal 0.0820 */
/**/
/*Equality of Variances : then H0: variance1=variance2 not hold*/
/*Value Pr > F */
/*1.14 <.0001 */

* Are the average sales amounts different campaign channels 'PARTNER', 'AD' and 'WEB'?;
/*GLM: General Linear Models*/
/*H0: the average sales amout among groups are equal*/
/*?????????????????? if different, which one is better, higher price ????*/
proc glm data=order_channel (where=(trim(channel) in ('PARTNER' 'AD'  'WEB')));
class channel;
model totalprice=channel;
/* MODEL dependents= independents */
run;
quit;

* Compare average sales amounts among different payment types.
If the differences among payment types are significant,
find the group differences by performing multiple comparison test;
proc glm data=order_channel;
class paymenttype;
model totalprice=paymenttype;
means paymenttype/duncan;
quit;

* Compare average sales amounts among years.
If the differences among years are significant,
find the group differences by performing multiple comparison test;
proc glm data=order_channel;
class year;
model totalprice=year;
means year/duncan;
quit;

* Use two-way analysis of variance to compare average sales amount
among four major states: 'New York', 'New Jersey', 'Florida' and 'California'
for campaign channels 'PARTNER', 'AD' and 'WEB'. ;
proc glm data=order_channel (where=(trim(channel) in ('PARTNER' 'AD'  'WEB')
and state in ('NY' 'NJ' 'FL' 'CA')));
class channel state;
model totalprice=channel|state;
means channel|state/duncan;
run;
quit;

proc means data=order_channel (where=(trim(channel) in ('PARTNER' 'AD'  'WEB')
and state in ('NY' 'NJ' 'FL' 'CA'))) nway;
class channel state;
var totalprice;
output out=means mean=;
run;
proc plot data=means;
plot totalprice*state=channel;
run;
quit;

************************************* Retail end Wireless Start ************************;

/*The default configuration location is c:\program files\SASHome\SASFoundation\9.3\nls\en\sasv9.cfg.*/

proc tabulate data=wireless missing;
var actdt deactdt;
table min*f=date. max*f=date. n*f=comma14. nmiss*f=comma14., actdt deactdt;
run;

/*2. tenure*/
data wireless;
set wireless;
if deactdt=. then tenure=&obs_date-actdt;
else tenure=deactdt-actdt;
churned=(deactdt ne .);
run;


proc univariate data=wireless plot;
var tenure;
run;

/*3 deactivate by month* EDA get understanding to data */
data wireless;
set wireless;
month=month(deactdt);
year=year(deactdt);
run;
proc sql;
create table month as
select year, month,
mdy(month, 01, year) as yearmon format=monyy7.,
       sum(case when deactdt=. then 0 else 1 end) as count
from wireless
where year ne .
group by year, month
order by 1 , 2;
quit;

/*4. forecast, handle forecast less than 0 , bundary effect */
proc forecast data=month lead=6 interval = month outfull out=f_out outresid;
var count;
id yearmon;
run;

proc sql;
select * from f_out where monotonic() between 1 and 200;
quit;

proc gplot data=month; /* time history */
symbol i=spline v=circle h=2;
plot count * yearmon;
run;

proc gplot data=f_out (where=( _type_ ne 'RESIDUAL'));
symbol1 i=none v=star h=2;
symbol2 i=spline v=circle h=2;
symbol3 i=spline l=20;
symbol4 i=spline l=20;
format yearmon monyy7.;
plot count * yearmon = _type_ /
href= "01feb2001"d ;
/*where _type_ ^= "RESIDUAL";*/
run;
quit;

/*5 segment*/
proc format;
value segment low-30 =1
            30<-60 =2
60<-365=3
365<-high=4
;
value segmentdesc
1 ='1 - less than 31 days'
            2 = '2 - between 31 and 60 days'
3='3 - between 61 days and a year'
4='4 - greater than a year'
;
value churned
1='Deactived'
            0='Still Alive'
;
run;

data wireless;
set wireless;
churned=(deactdt ne .);
segment=input(put(tenure, segment.), 4.0);
run;

proc tabulate data=wireless missing f=comma14.;
class churned segment;
table segment=''  all ='Total', (churned='' all='Total')*(N = '# of Accounts' pctn='Percent %' )  /box='Tenure';
format segment segmentdesc. churned churned. ;
run;


/*6,7 general association*/
proc freq data=wireless;
table segment*(goodcredit rateplan dealertype)/missing chisq;
table churned*(goodcredit rateplan dealertype)/missing chisq;
format segment segmentdesc. churned churned. ;
run;

/*8. segmentation*/
proc freq data=wireless;
table churned*segment/missing chisq;
format segment segmentdesc. churned churned. ;
run;



proc rank data=wireless groups=20 out=wireless_g;
var tenure;
ranks tenure_g; /* tie */
run;


proc sql;
select * from wireless_g where monotonic() between 1 and 10;
quit;
proc freq data=wireless_g;
table churned * tenure_g/missing chisq;
run;

proc rank data=wireless groups=5 out=wireless_g;
var tenure;
ranks tenure_g; /* tie */
run;

proc freq data=wireless_g;
table churned * tenure_g/missing chisq;
run;

/*2 stage rebin based on churn rate, first group to 20 groups,
then see data then change the group settings*/

proc sql;
create table wireles_g_1 as
select tenure_g, min(tenure) as min, max(tenure) as max
, sum(churned)/count(*) as col_churn format=percent11.2
from wireless_g
group by tenure_g;
quit;

/* re-group=data rebin-- use data step do the data rebin
based on column percent : see by your eye,
done by data sense + business sense*/
data wireless_g_2;
set wireless_g;
if tenure_g=0 then seg='a';
else if tenure_g in (1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) then seg='b';
else if tenure_g in (2, 13, 14, 15, 16, 17, 18) then seg='c';
else if tenure_g=19 then seg='d';
else seg='e';
run;



proc freq data=wireless_g_2;
table churned*seg/missing chisq;
run;

/*how to detect sinigicant difference*/
/*use t-test, log-rank test, wilcoxon test*/
/*halflife*/
proc lifetest data=wireless method=lt notable plots=(h s);
time tenure*churened(0);
strata goodredit;
run;

proc lifetest data=wireless method=lt notable plots=(h s);
time tenure*churned(0);
strata rateplan group=goodcredit;
run;

/*survival analysis*/
/*linear regression*/

/**************************** Advance SAS Test**************************/
/*get SAS work temp dir*/
%put %sysfunc(getoption(work));
%let sasTempWorkDir = %sysfunc(getoption(work));
%put &sasTempWorkDir;

/* Set statement: Syntax */
/*END = var */
/*KEY = */
/*NOBS = */
/*POINT = */
/*IN=*/


data adv8_1;
set timespans;
run;

/* 156258 obs*/
data adv8_2;
set timespans end=lastFlag ;
/*END= cannot be used with POINT=. When random access is used,
the END= variable is never set to 1
If you use a BY statement, END= is set to 1 when the SET statement reads
the last observation of the interleaved data set. For more information
*/
if lastFlag=0 then output;
run;

data adv8_2;
set timespans end=last;
if last then /* when end=1, means the record is the last one */
do;
 lastFlag = last;
 totalRecords = _N_;
 put  totalRecords=   lastFlag=;
 output;
end;
run;

data adv8_2;
set timespans end=lastFlag;
if lastFlag then /* when end=1, means the record is the last one */
do;
 totalRecords = _N_;
 put  totalRecords=   lastFlag=;
 output;
end;
else /* lastFlag = 0*/
do;
recordID = _N_;
output;
end;
run;

data adv8_3;
do i=1 to 1000;
set timespanS point=i;
output;
end;
stop;
run;

data ADV8_4(index=(channel));
set timespanS nobs=sales_cnt;  * ref to the total obs count;
if _N_ = 1 then
  put sales_cnt=;
else
 output;
run;

/*create a macro variable*/
data ADV8_4;
call SYMPUT('nobs_cnt2',put(sales_cnt,6.));
set timespanS nobs=sales_cnt;  * ref to the total obs count;
run;

%put &nobs_cnt2;  /* nobs_cnt2=156258  */
data ADV8_4;
set timespanS nobs=sales_cnt;  * ref to the total obs count;
call SYMPUT('nobs_cnt',put(sales_cnt,6.));
run;


data ADV8_5;
call SYMPUT('var1',6789); * ---------> numeric converted to char type;
/*correct call:*/
call symput('var2', '67890');
/*or correct call:*/
vars = 35566;
call symput('var21', put(vars, 5.));
vars1 = 75566;
call symput('var22', put(vars1, $5.));
call SYMPUT('var3','book');
call SYMPUT('var4',put(123445, 6.));
x = put(123445, 6.); * ----> x is char type ;
y = put('bookstore', $9.);
put x= y=;
run;
%put &var1 ;
%put &var2 ;
%put &var3 ;
%put &var4;
%put &var21;
%put &var22;
%put _ALL_;
%put _USER_;
%put _Automatic_;
%put this is string; /* no need to add quote */
%put 'this is string'; /* no need to add quote */
proc contents data=adv8_5;
run;

/*get macro variable inside a datastep*/
data adv8_6;
var1 = symget('var1');
var2 = symget('var2');
var3 = symget('var3');
put var1= var2= var3=;
run;

proc contents data=adv8_6;
run;
%put &nobs_cnt=;

/* time too long running */
/*data adv8_5;*/
/*set adv8_4 key=channel; */
/*run;*/

/*proc append:*/
/*the SET statement in a DATA step to concatenate two data sets,
the SAS System must process all the observations in both data sets
to create a new one.
The APPEND procedure bypasses the processing of
data in the original data set and adds new observations directly to
the end of the original data set. Therefore, the APPEND procedure
is more efficient*/

data adv9_a;
set sales.timespan_g(obs=200 firstobs=1);
run;

data adv9_b;
set sales.timespan_g(obs=400 firstobs=201);
run;

proc append base=adv9_a data=adv9_b;
run;
proc append base=adv9_a data=adv9_b;
run;
proc append base=adv9_a data=adv9_b force;
run;

/* If the DATA= data set contains variables that are longer than the variables in the
BASE= data set, the FORCE option must be used with PROC APPEND. Using the
FORCE option enables you to append the data sets. However, the DATA= variable
values will be truncated.

proc append -------force option --- Appending Variables with Different Lengths
If the DATA= data set contains variables that are longer than the variables in the
BASE= data set, the FORCE option must be used with PROC APPEND. Using the
FORCE option enables you to append the data sets. However, the DATA= variable
values will be truncated.

proc append -------force option -- Appending Variables with  Different Types
Force option , missing values are assigned to the DATA= variable values for the variable
whose type did not match.
*/
proc print data=adv9_a;
run;
proc contents data=adv9_a;
run;
proc reports data=adv9_a;
run;
proc catalog data=adv9_a;
run;


/*There were 200 observations read from the data set WORK.ADV9_B.*/
/*NOTE: 200 observations added.*/
/*NOTE: The data set WORK.ADV9_A has 400 observations and 7 variables.*/

/* drop a variable and test append again */
data adv9_b(drop=paymenttype);
set sales.timespan_g(obs=400 firstobs=201);
run;

proc append base=adv9_a data=adv9_b;  /* demage the original data */
run;
/*WARNING: Variable paymenttype was not found on DATA file.*/
/*NOTE: There were 200 observations read from the data set WORK.ADV9_B.*/
/*NOTE: 200 observations added.*/
/*NOTE: The data set WORK.ADV9_A has 600 observations and 7 variables*/

/*modify statement*/
proc contents data=adv9_a;
run;

data adv9_a;  /* the dataset should be the same name */
modify adv9_a;
cost = cost*1.12;
run;

data adv9_a1;
input school class;
cards;
1 30
2 32
;
data adv9_a2;
input school class;
cards;
1 20
1 21
2 26
2 28
3 37
;
data adv9_a2;
modify adv9_a2 adv9_a1;
by school;
run;
proc print data=adv9_a2;
run; ***Duplicate values of BY variables in Trans file ***
If duplicate values of the BY variable exist in the transaction data set,
the duplicate values overwrite each other so that the last value in the group
of duplicate transaction values is the result in the master data set ;

data trans;
input school class;
cards;
1 30
1 34
2 32 ;
data master;
input school class;
cards;
1 20
2 26
;
run;
/*result*/
/* 1        1        30*/
/* 2        1        21*/
/* 3        2        32*/
/* 4        2        28*/
/* 5        3        37*/

/*PROC DATASETS ;*/
/*Age/Append/Audit/Change/Copy/Exclude/Select/Delete/Modify/Format*/
/*IC Create/IC Delete/IC Reactivate/Index Create/Index Delete/Informat*/
/*Label/Rename/Repair/Save*/
libname ad2 "&prjPath\sasdata";
data ad2.ad2_1;
set sales.customer;
run;

proc datasets library=ad2;
age ad2_1 ad2_2;  /* delete ad2_2 --> copy ad2_1 to ad2_2*/
run;
proc datasets library=ad2;
append base=ad2_2 data=ad2_1;
run;
proc contents data=ad2._all_ nods;
run;

proc datasets library=ad2;
change ad2_1=ad2_1a; * change oldname=newname;
quit;

data channels;
set sales.timespan_g(keep=channel paymenttype cost Npurchase);
run;


proc datasets library=ad2;
copy in=work out=ad2;  /* specify source libname and target libname ,
normally copy used for transfer data from one system to another system.*/
select channels; /* specify the dataset name */
run;
quit;

proc datasets library=ad2;
copy in=work out=ad2;  /* specify source libname and target libname ,normally copy used for transfer data from one system to another system.*/
select channels;  /* specify the dataset name */
run;
quit;

/*
use change to change dataset name,
use modify+rename to change variable name
*/
proc datasets library=ad2;
change channels=chn;
copy in=ad2 out=work;
select chn;
run;
quit;

proc datasets library=ad2;
copy in=ad2 out=work;
* select :  if not specify select, then copy all the dataset;
run;
quit;


proc datasets library=ad2;
copy in=work out=ad2;
exclude channels;  /* only not copy channels, others copy all */
run;
quit;

proc contents data=ad2._all_ nods;
run;


/*customerid*/
/*firstname*/
/*gender*/
/*householdid*/

proc datasets;
delete adv8_5  chn trans;
run;
quit;

/*Save ?will delete all files except those listed on the SAVE statement*/
proc datasets library=ad2;
save master orderline;
run;
quit;

/*For an existing dataset the MODIFY command is the best way to make changes because
no observations are read in or written out during processing.
Using a data step with a set statement you can also make changes, however all
observations are read in & written out.
In a large dataset time and storage can be significant.*/
proc datasets library=ad2; /* only change the qualified data, more efficiency than datastep modify*/
modify master;
rename class = classId;
run;
quit;
proc contents data=ad2.master;
run;

libname fmtLibs "&prjpath\sasformat";  /* permant library */
proc format library=fmtLibs;
value answer
1 ='Yes'
2 ='No';
run;
proc format library=fmtLibs;
value tt
low - < 0 = 'below zero'
0 - < 5 = 'cold'
5 - < 10 = 'mild'
10 - < 15 = 'warm'
15 - high = 'hot'
;
run;

proc format  library=fmtLibs;
value gfmt
low-30 = 'low'
31-50 = 'mid'
51-high='high'
;
run;
proc format;
value  grade
low - <10='youth'
10 - <30 = 'yonge'
30 - <60='adult'
    60 - high='senior'
;
run;
proc format;
value $state
'New York', 'Akansas', 'Cary' = 'NC'
'NorthCaro'  = 'SY'
;
run;
/*use of format: gender=put(var, $fmt.);*/
proc format;
value $gdfmt
'M', 'm' ='Male'
'F', 'f'='Female';
run;

proc contents data=sales.customer;
run;

data customer(drop=gender);
set  sales.customer;
gndr = put(gender, gdfmt.);
run ;

proc print data=customer(obs=9);
run;

proc setinit;
run;

/*Data Validation with FORMATS*/
/*detect price outrange*/
proc print data=sales.orderline(obs=9);
run;
/*unitprice*/
proc format;
value priceRange
low - <0 = 'invalid'
0 - 10 = 'low'
10-5000 = 'mid'
other ='high'
;
run;
data  ad2_3(drop=priceGrp);
set sales.orderline;
priceGrp = put(unitprice, priceRange.);
select(priceGrp);
when('low') lowp+1;
when('invalid') put "invalid";
otherwise;
end;
if priceGrp in ('invalid', 'low') then
priceLabel = 'low price';
else
priceLabel = 'high price';
run;

/*subset var group use format*/
/*handle date re-bin*/

proc format ;
value bDateFmt
low -18322 = 'Early'   /* date1=mdy(03,01,2010); */
18322 - 19053 = 'Late'  /* date2=mdy(03,01,2012); */
other = 'N/A'
;
run;
data  ad2_3;
date1=mdy(03,01,2010);
put date1=;
a = put(date1, date9.);
put a=;
run;
data  ad2_3;
date1=mdy(03,01,2010);
a = put(date1, date9.);
date2=mdy(03,01,2012);
b = put(date2, date9.);
put a= b=;
run;

/* re-bin billdate format */
data ad2_3(keep=bDateGrp);
set sales.orderline;
bDateGrp = put(billdate, bDateFmt.);
run;

/* subset group use format */
data ad2_3;
set sales.orderline;
where(put(billdate, bDateFmt.) eq 'Early');
run;

proc format library=fmtLibs fmtlib;
run;
/*ERROR: File FMTLIBS.FORMATS.CATALOG was created for a different operating system.*/

/*add format search list: format library list*/
options fmtsearch=(fmtLibs work);  *same as dos path enviroment variable;
options fmtsearch=(fmtLibs work.formats);  *same as dos path enviroment variable;
/*the search order is : work.formats --> fmtlibs*/

/*Proc Sort*/
data ad2_3(sortedby=patid); /* add a sorted flag to a sorted dataset */
input patid @@;
cards;
110 109 111
;
run;
proc contents data=ad2_3;
run;
/* Sorted YES */
/* remove a sorted flag  */
data ad2_3(sortedby=_null_);
set ad2_3;
run;
proc contents data=ad2_3;
run;
/* Sorted No */
data ad3_1;
set sales.orderline;
run;
proc contents data=ad3_1;
run;
proc freq data=ad3_1;
tables billdate;
run;
proc sort data=ad3_1 out=ad3_2;
by orderid;
run;

proc sort data=ad3_1(firstobs=1 obs=300) out=ad3_3;
by orderid;
run;


proc sort data=ad3_1 out=ad3_4;
by orderid;
where  billdate eq mdy(11,28,2010);
run;

proc sql;
select * from ad3_1 where billdate between '23Mar2010'd and '21Sep2010'd;
quit;

proc sql;
select * from ad3_1 where billdate between '23Feb2011'd and '21Jan2011'd;
quit;


proc sort data=ad3_1 out=ad3_5;
by orderid;
where  billdate gt mdy(11,28,2010);
run;

proc sort data=ad3_1 out=ad3_6;
by orderid;
where  billdate between mdy(11,28,2009) and mdy(11,28,2010);
run;

proc print data=ad3_6(obs=20);
by billdate;
run;
proc sort data=ad3_1 out=ad3_7 noduprec ;
by orderid;
run;

proc contents data=ad3_1;
run;

proc sort data=ad3_1 out=ad3_7 nodupkey ;
by productid;
run;

data ad3_rebinDate;
set ad3_6;
if (billdate gt  mdy(11,28,2009)) and (billdate lt mdy(12,31,2009)) then bDtGrp = 1;
run;
/* subset group use format */
data ad2_5;
set sales.orderline;
run;


/* delete some data */
data ad2_5;
set ad2_5;
if (billdate gt mdy(01,01,2010)) and (billdate lt mdy(05,01,2010)) then
delete;
else
output;
run;


/*Integrity Constraints are a set of data validation rules
that you can specify to restrict the data values accepted into a SAS data file*/
proc contents data=ad2_5;
run;


/*create contraints*/
proc datasets nolist ;
modify ad2_5;
ic create val_price = check(where=(totalprice gt 0))
message = "total price too high, > 100"
;
/*four types of constraints*/
/*Check*/
/*Not Null*/
/*Unique*/
/*Primary Key*/

quit;

/*delete contraints*/
proc sql;
alter table ad2_5 DROP CONSTRAINT val_price;
quit;
/* Integrity constraint val_price deleted*/

/*macros*/
data ad4_1;
/*sasday,sasdate, sastime*/
put _all_;
run;
%put _all_;
/*example: user-defined macro variable*/;
/* list macro variables
_all_
_global_
_local_
_user_
_automatic_
*/

%macro createDS;
data ad4_1;
set sales.orderline;
run;
%mend;
%createDS;



proc contents data=ad4_1;
run;
%macro sortUnique;
proc sort data=ad4_1 out=ad4_2;
by orderid;
run;
%mend;
%sortUnique;
%macro ad4_3;
%local orderid ;
%global unitprice Male;
%let orderid = 229933;
%let unitprice = 33333;
%let Male = M;
%let year = 1998;
%let month = Jan;
%put &orderid;
%put YM = &year&month;
%let ya = 1998;
%let y = ya;
%put year=&&&y;
%mend;
%ad4_3;
%put &unitprice;
%put &unitprice;
%put Just_&Male;


/*If a pair of ampersands (&&) is encountered, the pair is resolved to a single
ampersand,
It resolves the entire reference from left-to-right.
If a pair of ampersands (&&) is encountered, the pair is resolved to a single
ampersand, then the next part of the reference is processed.
It returns to the beginning of the preliminary result and starts
resolving again from left-to-right.
When all ampersands have been fully processed, the resolution is complete.
*/
%macro ad4_4;
%let one=yourname;
%let last=one;
%put &&&&last &&&last;
%let test=one;
%let one= two;
%let two=three;
%let three=four;
%put &&&&&test; * &&one-->&one-->two;
%put &&&&&&&test; * & & & one --> &two --> three;
%put &&&&&&&&&test; * & & & one --> &two --> three from left to right, ;
%mend;
%ad4_4;

%macro ad4_5;
%let first=yourname;
%let last=first;
%put &&last;
%mend;
%ad4_5;

%let first=yourname;
%let last=first;
%put &&last;
%put &&&last;
%put &&&&last;


%macro ad4_5;
%let car='Johns Car';
%let Belongs=Car;
%put &&&belongs &&&&Belongs &&&&&Belongs &&&&&&Belongs &&&&&&&&Belongs;
%mend;
%ad4_5;  * 'Johns Car' Car 'Johns Car' 'Johns Car' Car;

%macro ad4_6;
%let car='Johns Car';
%let Belongs=Car;
%let Property = Belongs;
%put &&&Property &&&&Property &&&&&Property &&&&&&Property &&&&&&&&Property;
%mend;
%ad4_6; * Car Belongs Car Car Belongs ;
%macro ad4_7;
%let car='Johns Car';
%let Belongs=Car;
%let Property = Belongs;
%put &&&Property & & & &Property &&&&&Property &&&&&&Property &&&&&&&&Property;
%mend;
%ad4_7; * Maxium two lines  ;
%macro ad4_8;
%put &SYSDATE;
%put &SYSDATE9;
%put &SYSDAY;
%put &SYSTIME;
%put &SYSENV;
%put &SYSSCP;
%put &Sysver;
%put &sysJobID;
%put &SysLast;
%put &SysParm;
%put &SysErr;
%mend;
%ad4_8;
/*
07APR10
07APR2010
Wednesday
17:08
FORE
WIN
*/
/*show system default macro variables*/
%macro ad4_9;
%put _all_;
%put _global_;
%put _local_;
%put _user_;
%put _automatic_;
%mend;
%ad4_9;

%macro ad4_3;
%local orderid ;
%global unitprice Male;
%let orderid = 229933;
%let unitprice = 33333;
%put _local_;
%mend;
%ad4_3;
%put _all_;
%put _global_;
%put _local_;
%put _user_;
%put _automatic_;
/* Syntax
%MACRO macro-name (positional? keyword=);
SAS Statements;
Macro Statements;
%MEND;
*/
/*A macro definition or macro text is placed between a %MACRO statement and a */
/*%MEND (macro end) statement. */
/*It can include the following: */
/*Constant text including SAS dataset and variable names and statements */
/*Macro variables preceded by an ampersand Macro program statements preceded */
/*with a percent sign (%). */
/*Macro expressions same as tasks done in base SAS */
/*Macro functions similar to base SAS*/


%macro ad4_fabs(a,b,c); /* positional parameters */
%* read a, then output;
%put &a;
%mend;
%ad4_fabs(2222, 333,444);

/*Macro Positional Parameters -> Keyword Parameters */
%macro ad4_fabs2(age=5);
%* read a, then output  macro comment by %* ;
%put &age;
%mend;
%ad4_fabs2();
%put _global_;
%symdel Male; /* delete global macro variables*/

%ad4_fabs2();
/*Call Symput('macroVar', value) --> in datastep*/
/*Call Symget('macroVar')*/

/*macro use mixed parameters*/
%macro ad4_fabs3(pam1, pam2=good);
%let out=&pam1.&pam2;
%put &out;
%mend;
%ad4_fabs3(bad); * --> badgood ;
%ad4_fabs3(bad, pam2=bad);  * badbad, mixed macro parameters ;
%macro ad4_fabs4(pam1, pam3, pam2=good);  /*  All positional parameters must precede keyword parameters. */
%let out=&pam1.&pam2.&pam3;
%put &out;
%mend;
%ad4_fabs4(bad,bad , pam2=bad);
/*variable number of  parameters*/

%macro printz/parmbuff;   /* marco use parmbuff */
%put Syspbuff contains: &syspbuff;
%let num=1;
%let dsname=%scan(&syspbuff,&num);
%do %while(&dsname ne);
proc print data=&dsname(obs=2);
run;
%let num=%eval(&num+1);
%let dsname=%scan(&syspbuff,&num);
%end;
%mend printz;
%printz(orderline); /* print one dataset : parmbuff, syspbuff, */
%printz(orderline, ad4_5); /*  print two dataset */
%printz(orderline, ad4_5, ad4_5); /*  print 3 dataset */

/*local varible nested*/
/*
each macro program in the example above has its own local symbol
table that exists as long as the macro executes. When a macro finishes executing, its
local symbol table and all of the local macro variables that are contained in that table
are erased. */
option mprintnest; /* print nested infor */
option mlogic mlogicnest; /* must set both option*/
%macro outer;
%local variX;
%let variX=one;
%inner
%mend outer;
%macro inner;
%local variY;
%let variY=&variX;
%put var is &variY ;
%mend inner;
%outer;
%put &variX;


/* default macro storage: Work.Sasmacr.MacroName.Macro.*/
%INCLUDE file-specification </SOURCE2>;
%include 'c:\sasfiles\prtlast.sas' /source2;
/*source2: whether display the included macro code in sas log */
options source2;
/* SOURCE2
causes the SAS statements that are inserted into the program to be displayed in the SAS log.
If SOURCE2 is not specified in the %INCLUDE statement, then the setting of the SAS
system option SOURCE2 controls whether the inserted code is displayed.
*/
data ad4_5;
set sales.orderline;
date = '05';
call symput('shpdt', shipdate);  *----------> create Macro Variable in data step ;
if shipdate lt mdy(05,23,2010) then output;
run;
%put _user_;



data ad4_5;
shipdate = '02Jan1960'd;
put shipdate=;
/*create a macro variable named "shpdt05"*/
run;


data ad4_5;
date = '05';
shipdate = '30Jan2012'd;
call symput('shpdt'||date, shipdate);  *----------> create Macro Variable in data step ;
call symput('shpdt'||'06', shipdate);  *----------> create Macro Variable in data step ;
today = "&Sysdate";
put today=;
/*create a macro variable named "shpdt05"*/
run;



%put &shpdt05;  * --------------> use Macro Variables in Session Enviroment;
%put &shpdt06;  * --------------> use Macro Variables in Session Enviroment;
%symdel Shpdt;

data ad4_6;
date = "&SYSDATE";
dt2 = "&SYSDATE9";  * 10JUL2014;
day = "&SYSDAY";    * Thursday;
tm = "&SYSTIME"; * 21:29;
env = "&SYSENV";
scp = "&SYSSCP";    * WIN;
ver = "&Sysver";    * 9.3;
jobid = "&sysJobID";
last = "&SysLast";  * =WORK.AD4_6;
parm = "&SysParm";
err = "&SysErr";
* today = &sysdate; * error ;
put date= dt2= day= tm= env= ver= scp= jobid= last= parm= err=;
put today=;
run;

data ad4_7;
call symput('var1', '05Jan2010'd);
call symputx('var2', '   a b cdefg   ');  /* remove leading and trail blanks*/
call symput('var3', '   a b cdefg   ');
run;
%put &var1; *  18267;
%put &var2||'wwwww';  * a b cdefg||'wwwww';
%put &var3||'wwwww';  * a b cdefg   ||'wwwww';

data ad4_8;
price = input(symget('sysdate'), date9.);  * numeric ;
dt = symget('sysdate');  * ---> char;
put dt=;
put price=;
/*dt          Char    200*/
/*price       Num       8*/
run;
proc contents data=sales._all_ nods;
run;

proc contents data=ad4_8;
run;

/*using macro vars in proc sql*/
data ad4_7;
set sales.campaign;
call symput('cmpid', campaignid);
call symput('cmpname', campaignname);
call symput('channel', channel);
run;
proc contents data=ad4_7;
run;
proc sql;
select * from ad4_7 where monotonic() between 1 and 12;
quit;
proc sql;
select campaignid into :cmpid from ad4_7  where monotonic() between 1 and 12;
quit;


%put &cmpid;
proc sql;
select count(campaignid) into :cmpCnt from ad4_7 ;
quit;


%put Campaign Count= &cmpCnt;


proc sql;  /*count -- did not include the missing value*/
select distinct campaignid, count(distinct campaignid) into :cmpCnt from ad4_7 ;
quit;


proc sql noexec; /* CHECK SQL SYNTAX, BUT NOT EXCUTE SQL SCRIPT*/
select  count(distinct campaignid) into :cmpCnt from ad4_7 ;
quit;

proc sql;
select distinct campaignid from ad4_7 ;
quit;

proc sql;
validate /* only check the select clause which follows validate., validate syntax*/
select distinct campaignid from ad4_7 ;
quit;


proc sql;
select campaignid from ad4_7 ;
quit;

%put Campaign Count= &cmpCnt;


/*select to a group of macro variables*/
proc sql;
select count(campaignid) into :n1-:n8 from ad4_7
    group by channel;
quit;
%put Campaign Count= &n1 , &n2, &n3, &n4, &n5, &n6, &n7, &n8;

proc sql;
select channel into :channel from ad4_7  where monotonic() between 1 and 12;
quit;
%put &cmpname;
%put &channel;
%put &cmpid;


data ad4_5;
set sales.orderline;
run;
proc contents data=ad4_5;
run;
proc sql;
select * from ad4_5 where monotonic() between 1 and 2;
select orderid into :ids separated by ',' from ad4_5 where monotonic() between 1 and 10;
quit;
%put &ids;
data ad4_6;
set sales.orders;
run;
proc contents data=ad4_6;
run;
/*create macro variables, separated by ',', then use the macro variable*/
proc sql;
select "'" || paymenttype || "'" into :ids2 separated by ',' from ad4_6 where monotonic() between 1 and 10;
quit;
%put &ids2; * 'DB','DB','VI','VI','MC','AE','VI','AE','AE','DB' ;
/*use the created macro variable: where in()*/
proc sql noprint;
select * from ad4_6 where paymenttype in (&ids2);
quit;
proc sql ;
create table iList as
select * from ad4_6 where paymenttype in (&ids2);
select * from iList where monotonic() between 1 and 2;
quit;
/*
Placement of SAS code within macros has various advantages.
1) Program code reduction
2) Eliminate repetitive changes
3) Provide conditional execution
4) Reduce typing errors. Below are examples of simple macro executions.
*/

%macro ad4_8;
%if 6>5 %then
%do;
%put "somthing";
%end;
%else
%do;
%put "some other thing";
%end;
%mend;

/*include macros and then call macros*/
%include "&macroPath\macros.sas";
%inMacro;

/*call macros from data step*/
/*define macro then call it*/
%macro ad4_8(param);
%put &param;
%mend;
data ad4_9;
call execute('%inMacro');
call execute('%ad4_8(3445555)');
run;
%macro ad4_10(id);
%do i=1 %to 10;
data ad4_&i;
%put %eval(&id)_&i;  
  run;
    %end;
%mend;
%ad4_10(1223);
data _null;
call execute('%ad4_10(1223)');
run;
/*create macro from proc sql*/
proc sql;
select  paymenttype into :ids2 separated by ',' from ad4_6 where monotonic() between 1 and 10;
quit;
%put &ids2; * DB,DB,VI,VI,MC,AE,VI,AE,AE,DB;


proc contents data=ad4_7 nods noprint out=ad4_8;
run;

proc contents data=ad4_7 nods ;
run;
proc print data=ad4_8;
run;


proc sql;
select channel into
:channel separated by ','
from ad4_7  where monotonic() between 1 and 90;
quit;
%put &channel;

proc sql;
select channel,discount into
:channel separated by ',' ,
:discount separated by ','
from ad4_7  where monotonic() between 1 and 90;
quit;
%put &channel and &discount;
%macro ad4_8;
%do i=1 %to 10;
%put &channel(i);
%end;
%mend;
%ad4_8;

%LET
%PUT
%DISPLAY ;
%GLOBAL
%INPUT
%MACRO
%INCLUDE
%IF %THEN; %ELSE;
%DO ; %END;
%GOTO & %label;

/*Macro functions*/
/*Macro functions all begin with a percent sign (%) and mostly operate the same
as their base SAS function counterparts do.*/
%LENGTH %SCAN %SUBSTR %UPCASE %SYMGET

%let a=begin;
%let b=%nrstr(&a);
%put UPCASE produces: %upcase(&b);
%put QUPCASE produces: %qupcase(&b); * --> &A   %qupcase , do not resolve ;

%STR --- masks the semicolon by quoting it.
%let createDS=%str(data ad4_9; set salse.orderline ;  run;);
%put &createDS;

%NRSTR --- not resolve &
%let ctStr = %NRstr(A&B);
%put &ctStr;   /* A&B */


* %EVAL --- compute the arithmetic computation  ;
%LET X=5;
%LET Y=&X+1;
%LET Z=%EVAL(&X+1);
%LET AB=%EVAL(&X/&y);
%let Cd = &X/&Y;
%PUT &X &Y &Z &AB &Cd ;  
*----- 5 5+1 6 2 5/5+1 ---------------;
%LET Z=%EVAL(&X+1.8); ----> ERROR: A character operand was found in the %EVAL function
%PUT &Z; *** ----------- EVAL require integer;

%let real=2.4;
%let int=8;
%put value=%eval(&real+&int); * ----> Error:  A character operand was found in the %EVAL function

* %SYSEVALF(a+b, Integer|ceil|floor) ---------> evaluate floating expression;
%let b=2.4;
%let a=8.999;
%put value=%sysevalf(%sysevalf(&a)+%sysevalf(&b), ceil);
%put value=%sysevalf(1.2+1.3, ceil);
%put value=%sysevalf(%sysevalf(&a)+%sysevalf(&b));  * float;
%put value=%sysevalf(%sysevalf(&a)+%sysevalf(&b), integer);
%put value=%eval(&a+1);
%put value=%eval(1.4+1);  * error
%put value=%eval(1+1);
%put value=%eval(%sysevalf(1.4)+1);  * error ;

%macro check(num=4);
%let result=%sysevalf(&num+0.5);
%put result is &result;
%mend;
%check(num=10);
%check(); * 4.5 ;
%check(num=12);  * 12.5;


%macro check2(num=4, lens=5);
%let result=%sysevalf(&num+0.5);
%let total = %eval(%eval(&num) + %eval(&lens));
%put result is &result, total is &total;
%mend;
%check2(num=4,lens=5);
%check2();
/*Macros stored in this temporary SAS catalog are deleted at the
end of the SAS session.
A macro is compiled when the %INCLUDE statement is submitted
*/
%macro check3(num=4.4, lens=5.5);
%let result=%sysevalf(&num+0.5+&lens);
%put result is &result;
%mend;
%check3(num=4.9,lens=5.5);
%check3();



/*Syntax*/
PROC SQL <liste-option> ;
ALTER TABLE instruction-alter ;
CREATE instruction-create ;
DELETE instruction-delete ;
DROP instruction-drop/view, table, index from ;
INSERT instruction-insert ;
RESET <list-option> ;
SELECT instruction-select ;<options>
Distinct object-item INTO macro-variable-specification
FROM from-list
WHERE sql-expression
GROUP BY group-by-item
HAVING sql-expression
ORDER BY;

/*The HAVING clause: together with group by
is used in combination with the GROUP BY clause.
It is similar to the Where clause, but the Having clause establishes
restrictions that determine which records are displayed after they have
been grouped.*/
/*
To do this Use this statement
---------------------------------------------
Modify, add, or drop columns ALTER TABLE
Establish a connection with a DBMS CONNECT
Create an index on a column CREATE INDEX
Create a PROC SQL table CREATE TABLE
Create a PROC SQL view CREATE VIEW
Delete rows DELETE
Display a definition of a table or view DESCRIBE
Terminate the connection with a DBMS DISCONNECT
Delete tables, views, or indexes DROP
Send a DBMS-specific nonquery
SQL statement to a DBMS EXECUTE
Add rows INSERT
Select and execute rows SELECT
Query a DBMS CONNECTION TO
Modify values UPDATE
*/

data ad5_1;
set sales.campaign;
run;
proc sql;
 select * from ad5_1 (obs=100);
quit;

proc sql;
describe table ad5_1;
quit;
proc sql;
   select * from ad5_1 (obs=100)
   where channel in ('MAIL','WEB');
quit;

proc sql;
   select * from ad5_1 (obs=100)
   where campaignname is missing;
quit;

proc sql;
   select * from ad5_1 (obs=100)
   where campaignname is null; /* equal to campaignname=' ', or price=.  */
quit;

proc sql;
   select * from ad5_1 (obs=100)
   where campaignname is not null;
quit;

data ad_order;
set sales.orders;
run;
proc sql;
describe table ad_order;
quit;


/*totalprice,    numorderlines ,    numunits*/
proc sql;
select
AVG(totalprice) as aPrice ,
Mean(numOrderlines) as mOdln,
Count(orderid) as totalOdln,
Freq(orderid) as ttOdln,
Cv(totalprice) as coPrice,
Max(totalPrice) as highPrice,
Min(totalPrice) as lowestPrice,
Nmiss(totalprice) as NMissing,
Var(totalPrice) as variancePrice,
Sum(totalPrice) as totalSaleAmt,
Range(totalPrice) as RangePrice,
Range(numunits) as rangeUnits
from ad_order;
quit;
proc sql;
select
totalprice,
Range(totalPrice) as RangePrice,
Max(totalprice) as High,
Min(totalprice) as low,
Range(numunits) as RangeUnits,
Max(numunits) as maxUnits,
Min(numunits) as minUnits,
Range(numorderlines) as RangeOrdln,
Max(numorderlines) as maxOrdln,
Min(numorderlines) as minOrdln
from ad_order
where monotonic() between 1 and 2
;
quit;

proc sql;
select
totalPrice
from ad_order
    where totalprice ge 5000; /* ge: good */
;
quit;


proc sql;
select
totalPrice, numunits
from ad_order
    where totalprice > 5000 and numunits ge 30; /* ge ==== > */
;
quit;


proc sql feedback; /* replace *, resolve macros , add to log */
select
totalPrice, numunits
from ad_order
    where totalprice > 5000 and numunits ge 30;
;
quit;

proc sql outobs=10;  /* WARNING: Statement terminated early due to OUTOBS=10 option.*/
select totalPrice, numunits
from ad_order
    where totalprice > 5000 and numunits ge 3;
;
quit;

proc sql inobs=300 outobs=10;  /* WARNING: Statement terminated early due to OUTOBS=10 option.*/
select totalPrice, numunits
from ad_order
    where totalprice > 5000 and numunits ge 3;
;
quit;


/*SQL: Summerize data,  summary functions, the first is SQL func, second is SAS func*/
/*
AVG, MEAN mean or average of values
COUNT, FREQ, N number of nonmissing values
CV coefficient of variation (percent)
MAX largest value
MIN smallest value
NMISS number of missing values
RANGE range of values
STD standard deviation
STDERR standard error of the mean
SUM sum of values
VAR Variance
*/
data wild;
input name $;
datalines;
Diana
Diane
Dianna
Dianthus
Dyan
;
run;
proc sql;
/*
LIKE 'D_an' Dyan
LIKE 'D_an_' Diana, Diane
LIKE 'D_an__' Dianna
LIKE 'D_an%' all names from the list
*/
select * from wild where name like 'D_an'; /* single wild*/
select * from wild where name like 'D_an_'; /* single wild*/
select * from wild where name like 'D_an__'; /* single wild*/
select * from wild where name like 'D_an%'; /* single wild*/
quit;

data wild;
input address $ 1-30;
datalines;
12 Piedpiper Place
39 Pepper Place
12 Piney Place
78 Piper Place
14 Picture Place
45 Pecan Place
;
run;
proc print data=wild;
run;
/*SQL options: nodouble, feedback, noprint, INOBS=n, OUTOBS=n, number*/
proc sql double;/* nodouble, feedback, noprint, INOBS=n, OUTOBS=n, number| |*/;
data ad5_2;
set sales.customer;
run;
/* nodouble, feedback, noprint, INOBS=n, OUTOBS=n, number| |*/;
proc sql feedback inobs=50;
select customerid, firstname, gender,householdid from ad5_2
;
quit;

proc sql feedback inobs=50 outobs=80;
select customerid, firstname, gender,householdid from ad5_2
;
quit;

proc sql feedback noprint;
select customerid, firstname, gender,householdid from ad5_2
;
quit;


proc sql feedback;
select customerid, firstname, gender,householdid from ad5_2
where monotonic() between 1 and 10;
quit;
/*
NOTE: Statement transforms to:
        select AD5_2.customerid, AD5_2.firstname, AD5_2.gender, AD5_2.householdid
          from WORK.AD5_2
         where (MONOTONIC() between 1 and 10);
*/

proc sql double;
select customerid, firstname, gender,householdid from ad5_2
    where monotonic() between 1 and 10;
quit;


proc sql;
select distinct customerid, firstname, gender,householdid from ad5_2
    where monotonic() between 1 and 100000;
quit;

/*
To create a list of unique Customer_Id values from the customer data set, which
of the following techniques can be used?

technique 1: proc SORT with NODUPKEY and OUT=
technique 2: data step with IF FIRST.Customer_Id=1
technique 3: proc SQL with the SELECT DISTINCT statement

answer: All, 1,2,3
*/

proc sql;
select distinct firstname, gender,householdid from ad5_2
    where monotonic() between 1 and 10000;
quit;

proc sql ;
select * from ad5_1;
quit;

/*SQL Dictionary*/
/*
DICTIONARY.OPTIONS System options
DICTIONARY.TITLES Title and footnote lines
DICTIONARY.EXTFILES Filerefs
DICTIONARY.MEMBERS SAS files
DICTIONARY.CATALOGS Catalogs
DICTIONARY.MACROS Macros
DICTIONARY.TABLES SAS data files
DICTIONARY.VIEWS Views
DICTIONARY.COLUMNS Variables in SAS datasets
DICTIONARY.INDEXES Indexes
*/

proc sql;
 create table ops as
 select * from DICTIONARY.OPTIONS;
quit;

/*add constraints*/
proc sql;
create table work.employees
(ID char (5) primary key,
Name char(10),
Gender char(1) not null check(gender in ('M','F')),
HDate date label='Hire Date');
quit;

proc sql;
create table work.discount3
(Destination char(3),
BeginDate num Format=date9.,
EndDate num format=date9.,
Discount num,
constraint ok_discount check (discount le .5) MESSAGE='message-string',
constraint notnull_dest not null(destination) MESSAGE='message-string');
quit;

/* describle constraints in a table */
proc sql;
describe table constraints discount3;
quit;

proc sql ;
describe view ;
quit;
/* add undo policy */
proc sql undo_policy=none; /* required/none/optional */

quit;
proc contents data=ops;
run;

proc sql ;
select distinct group into :grp seperated by ',' from dictionary.options;
quit;
%put &grp;
/*
CODEGEN,COMMUNICATIONS,DATAQUALITY,EMAIL,ENVDISPLAY,ENVFILES,ERRORHANDLING,EXECMODES,EXTFILES
,GRAPHICS,HELP,INPUTCONTROL,INSTALL,LANGUAGECONTROL,LISTCONTROL,LOGCONTROL,LOG_LISTCONTROL,MA
CRO,MEMORY,META,ODSPRINT,PDF,PERFORMANCE,SASFILES,SECURITY,SORT,SQL,SVG,UNKNOWN
*/
proc sql;
select * from dictionary.options where group='EMAIL';
quit;
proc sql;
select * from dictionary.options where group='GRAPHICS';
quit;
proc sql;
select * from dictionary.options where group='MEMORY';
quit;

proc sql;
select * from DICTIONARY.TITLES; * Title and footnote lines ;
quit;

proc sql stimer outobs=2;
create table aa as
select * from orderline; * Title and footnote lines ;
select * from aa;
quit;

proc sql outobs=20 double;
select * from orderline; * Title and footnote lines ;
quit;

proc sql stimer; /* give each step excute time */
select * from orderline where monotonic() between 1 and 5000; * Title and footnote lines ;
select * from orderline where billdate > '21Jun2009'd and  monotonic() between 1 and 5000;
quit;

/*
in proc sql:
The DOUBLE | NODOUBLE and the FLOW | NOFLOW | FLOW=n| FLOW=n
m options do not affect the appearance of HTML, PDF, or RTF output that is
created with the Output Delivery System.
*/
/*
247      select * from orderline where monotonic() between 1 and 5000;
NOTE: SQL Statement used (Total process time):
      real time           8.21 seconds
      cpu time            0.23 seconds

247!                                                                   * Title and footnote lines
247!  ;
248      select * from orderline where billdate > '21Jun2009'd and  monotonic() between 1 and
248! 5000;
NOTE: SQL Statement used (Total process time):
      real time           0.15 seconds
      cpu time            0.15 seconds

*/


proc sql stimer outobs=20 flow=80 55;
select * from orderline; * Title and footnote lines ;
quit;

proc sql nostimer outobs=2000 number;  /* output row number */
select * from orderline; * Title and footnote lines ;
quit;


proc sql nostimer outobs=2;
create table aa as
select * from orderline; * Title and footnote lines ;
select * from aa;
quit;
proc sql stimer;
select * from DICTIONARY.TITLES; * Title and footnote lines ;
quit;
proc sql;
select * from DICTIONARY.EXTFILES; * Filerefs ;
quit;
proc sql;
select * from DICTIONARY.MEMBERS; * SAS files ;
quit;
proc sql;
select * from DICTIONARY.CATALOGS; * Catalogs ;
quit;

%macro adt125;
%put "dictionary.macros";
%global userMacro;
%let userMacro='abcefg';
%mend;
%adt125;
proc sql;
create table mcr as
select * from DICTIONARY.MACROS; * Macros ;
select * from mcr where upcase(name)='USERMACRO';
quit;


proc sql;
select * from DICTIONARY.tables; * Macros ;
quit;

proc contents data = mcr;
run;
/*
GLOBAL                            SQLOBS                                   0  0
GLOBAL                            SQLOOPS                                  0  0
GLOBAL                            SYS_SQL_IP_ALL                           0  -1
GLOBAL                            SYS_SQL_IP_STMT                          0
GLOBAL                            SQLXOBS                                  0  0
GLOBAL                            SQLRC                                    0  0
GLOBAL                            DT                                       0  1827
GLOBAL                            SQLEXITCODE                              0  0
GLOBAL                            USERMACRO                                0  'abcefg'
AUTOMATIC                         AFDSID                                   0  0
AUTOMATIC                         AFDSNAME                                 0
AUTOMATIC                         AFLIB                                    0
AUTOMATIC                         AFSTR1                                   0

*/

/*
AUTOMATIC                         SYSODSPATH                               0   SASUSER.TEMPLAT
                                                                              (UPDATE)
                                                                              SASHELP.TMPLMST
                                                                              (READ)
AUTOMATIC                         SYSPARM                                  0
AUTOMATIC                         SYSPBUFF                                 0
AUTOMATIC                         SYSPROCESSID                             0  41D7A1DEBC9B126F401
                                                                              8000000000000
AUTOMATIC                         SYSPROCESSNAME                           0  DMS Process
AUTOMATIC                         SYSPROCNAME                              0  SQL
AUTOMATIC                         SYSRC                                    0  0
AUTOMATIC                         SYSSCP                                   0  WIN
AUTOMATIC                         SYSSCPL                                  0  W32_VSPRO
AUTOMATIC                         SYSSITE                                  0  0070003549
AUTOMATIC                         SYSSIZEOFLONG                            0  4
AUTOMATIC                         SYSSIZEOFUNICODE                         0  2
AUTOMATIC                         SYSSTARTID                               0
AUTOMATIC                         SYSSTARTNAME                             0
AUTOMATIC                         SYSTCPIPHOSTNAME                         0  BravEfcntCA
AUTOMATIC                         SYSTIME                                  0  18:05
AUTOMATIC                         SYSUSERID                                0  ch
AUTOMATIC                         SYSVER                                   0  9.2
AUTOMATIC                         SYSVLONG                                 0  9.02.02M2P090109
AUTOMATIC                         SYSVLONG4                                0  9.02.02M2P09012009
AUTOMATIC                         SYSWARNINGTEXT                           0   Apparent symbolic
                                                                              reference DT not
                                                                              resolved
*/

/* see a list of dataset in work lib */
proc sql;
select * from DICTIONARY.TABLES where UPCASE(LIBNAME)="WORK"; * SAS data files ;
quit;

proc sql noprint;
create table dtset as
select * from DICTIONARY.TABLES where UPCASE(LIBNAME)="WORK"; * SAS data files ;
quit;

proc contents data=dtset;
run;
/*
 #    Variable         Type    Len    Format       Informat     Label

 22    maxvar           Num       8                              Longest variable name
  5    memlabel         Char    256                              Data Set Label
  2    memname          Char     32                              Member Name
  3    memtype          Char      8                              Member Type
  8    modate           Num       8    DATETIME.    DATETIME.    Date Modified
 21    nlobs            Num       8                              Number of Logical Observations
  9    nobs             Num       8                              Number of Physical Observations
 15    npage            Num       8                              Number of Pages
 40    num_character    Num       8                              Number of Character Variables
 41    num_numeric      Num       8                              Number of Numeric Variables
 11    nvar             Num       8                              Number of Variables
 10    obslen           Num       8                              Observation Length
 17    pcompress        Num       8                              Percent Compression
 12    protect          Char      3                              Type of Password Protection
 32    reqvector        Char     24    $HEX48.      $HEX48.      Requirements Vector
 18    reuse            Char      3                              Reuse Space
 31    sortchar         Char      8                              Charset Sorted By
 29    sortname         Char      8                              Name of Collating Sequence
 30    sorttype         Char      4                              Sorting Type
  6    typemem          Char      8                              Data Set Type
*/
/*get all dataset name in work lib*/
proc sql noprint;
create table dsNames as
select memname from DICTIONARY.TABLES where UPCASE(LIBNAME)="WORK"; * SAS data files ;
quit;
proc print data=dsNames;
run;


proc sql;
select * from DICTIONARY.VIEWS; * Views ;
quit;
/*select columns name of all tables.*/
proc sql;
select * from DICTIONARY.COLUMNS; * Variables in SAS datasets ;
quit;
proc sql;
select * from DICTIONARY.COLUMNS where  upcase(libname)="WORK" and
upcase(name)="ORDERLINE"
; * Variables in SAS datasets ;
quit;

proc sql;
select * from DICTIONARY.COLUMNS where  upcase(libname)="WORK";
* Variables in SAS datasets ;
quit;

proc sql;
select * from DICTIONARY.INDEXES; * Indexes;
quit;

/*understand the set operations*/
proc sql; select * from table1 except select * from table2;
proc sql; select * from table1 intersect select * from table2;
proc sql; select * from table1 union select * from table2;
proc sql; select * from table1 outer union select * from table2;
proc sql; select * from table1 outer union corr select * from table2;
proc sql; select * from table1 outer union CORRESPONDING select * from table2;
proc sql; select * from a union all select * from b;
proc sql; select * from a union  select * from b;

proc contents data=sales.orders nods;
run;
/*
3    campaignid       Num       8    BEST12.      BEST32.
5    city             Char     50    $50.         $50.
2    customerid       Num       8    BEST12.      BEST32.
*/

proc contents data=sales.customer nods;
run;
/*
1    customerid     Num       8    BEST12.    BEST32.
4    firstname      Char     50    $50.       $50.
3    gender         Char      1    $1.        $1.
2    householdid    Num       8    BEST12.    BEST32.
*/
proc sql noprint;
create table C as
select *
from sales.Orders as A join sales.customer as B
on A.customerId = B.customerid ;
select * from C where monotonic() between 1 and 2;
quit;
proc sql;
select * from C where monotonic() between 1 and 2;
quit;

data ad5_a;
input IDnum Jobcode $ Salary Bonus;
cards;
1400 ME1 29769 587
1403 ME1 28072 342
1120 ME1 28619 986
1653 ME2 35108 987
;
run;
data ad5_b;
input IDnum Jobcode $ Salary;
cards;
1653 ME2 35108
1782 ME2 35345
1244 ME2 36925
1120 ME1 28619
;
run;

proc contents data=ad5_a;
run;

proc contents data=ad5_a;
run;
/*Using the Keyword CORR with the OUTER UNION Operator*/
proc sql;
select * from ad5_a , ad5_b;
quit;
proc sql;
create table ad5_botha as
select coalesce(ad5_a.IDnum, ad5_b.IDnum)  as ID
from ad5_a outer union select * from ad5_b
where ad5_a.IDnum=ad5_b.IDnum;
quit;

proc sql;
create table ad5_botha as
select ad5_a.IDnum, ad5_b.IDnum  as ID
from ad5_a outer union select * from ad5_b;
quit;

proc sql;
create table ad5_bothb as
select * from ad5_a union select * from ad5_b;
quit;

/*add together both*/
proc sql;
create table ad5_bothc as
select * from ad5_a union all select * from ad5_b;
quit;
proc sql;
create table ad5_bothd as
select * from ad5_a union corr select * from ad5_b;
quit;

/*proc sql; select * from table1 except select * from table2;*/
proc sql;
create table ad5_bothf as
select * from ad5_a except corr select * from ad5_b;
quit;

proc sql;
create table ad5_c as
select IDnum, Jobcode, Salary from ad5_a;
quit;

proc sql; select * from table1 intersect select * from table2;
proc sql;
create table ad5_bothg as
select * from ad5_c intersect select * from ad5_b;
quit;

proc sql;
create table ad5_bothh as
select * from ad5_c intersect all select * from ad5_b;
quit;

proc sql;
create table ad5_bothi as
select * from ad5_c intersect corr select * from ad5_b;
/* must have same column */
quit;

proc sql;
create table ad5_bothi as
select * from ad5_c outer union select * from ad5_b;
/* total record = C+D, result depend on the first table */
quit;

proc sql;
create table ad5_bothj as
select * from ad5_b outer union select * from ad5_c;
/* total record = C+B,  result depend on the first table */
quit;

proc sql;
create table ad5_bothi as
select * from ad5_a outer union select * from ad5_b;
/* total record = A+D: first set all ,second nothing. */
quit;
proc sql;
create table ad5_bothi as
select * from ad5_a union select * from ad5_b;
/* total record = A+D : order by mixed way */
quit;

/*proc sql; select * from table1 outer union corr select * from table2;*/
proc sql;
create table ad5_bothj as
select * from ad5_a union corr select * from ad5_b;
/* total record = A+B, only the matched columns : order by mixed way */
quit;

/*proc sql; select * from table1 outer union CORRESPONDING select * from table2;*/
proc sql;
create table ad5_bothh as
select * from ad5_a union CORRESPONDING select * from ad5_b;
/* total record = A+B, only the matched columns : order by mixed way */
quit;

proc sql;
create table ad5_bothk as
select * from ad5_a union all select * from ad5_b;
/* total record = add both together */
quit;


proc sql;
create table ad5_bothk as
select * from ad5_c union all select * from ad5_b;
/* total record = add both together */
quit;

/*Exists: Tests if a subquery returns one or more rows. The result of
an EXISTS condition is true if the subquery resolves to at least one row.*/

proc sql;
select * from ad5_a
where exists(select Jobcode from ad5_a where bonus > 900);
quit;
/*
IDnum  Jobcode     Salary     Bonus
----------------------------------
1400  ME1          29769       587
1403  ME1          28072       342
1120  ME1          28619       986
*/

proc sql;
select * from ad5_a
where exists(select Jobcode from ad5_a where bonus > 7900);
quit;
/*No rows were selected.*/


proc sql;
select * from ad5_a
where IDnum in (select Idnum from ad5_a where bonus > 900);
quit;


proc sql;
select * from ad5_a ;
quit;
/*CASE:
case()
when 1 then 1
when 2 then 2
else then 3
end

*/
proc sql;
select IdNum,Salary, jobcode,
case (Idnum)
when 1400 then 'Grade1'
when 1403 then 'Grade2'
when 1120 then 'Grade3'
else 'No Grade'
end as JobLevel
from Ad5_a;
quit;


proc sql;
select a.IdNum,b.Salary, c.jobcode,
case (a.Idnum)
when 1400 then 'Grade1'
when 1403 then 'Grade2'
when 1120 then 'Grade3'
else 'No Grade'
end as JobLevel label 'Level of Grade'
from Ad5_a as a, Ad5_b as b, Ad5_c as c;
quit;

/*Join*/
/*Combining data Using PROC SQL*/;
data ad5_6a;
input id1 $ d $ @@;
cards;
key01 data01 key01 data01 key02 data03
; run;
data ad5_6b;
input id2 $ n $ @@;
cards;
key01 name01 key01 name02 key03 name03
;
run;
proc sql ;
select id1 label='the label' format=$20. from ad5_6a;
quit;

proc sql ;
select id1 label='the label' format=$10. from ad5_6a;
quit;


proc sql ; /* add constant to report */
select 'ID is:', id1 label='the label' format=$10. from ad5_6a;
quit;


proc sql ;
title 'The title of search report';
title2 'The second title of search report - sub title';
select id1 label='the label' format=$5. from ad5_6a;
footnote 'author: Duke Chen, 2014';
footnote2 'In Toronto';
quit;

proc sql;
create table ad5_6both as
select a.id1, a.d, b.n from ad5_6a as a inner join ad5_6b as b on a.id1=b.id2;
/*inner join*/;
select * from ad5_6both;
quit;


proc sql;
create table ab_both as
select a.id1, a.d, b.n from ad5_6a as a inner join ad5_6b as b on a.id1=b.id2;
select * from ab_both;
/*inner join*/;
/*When you specify multiple tables in the FROM clause but do not include a
WHERE statement to subset data, PROC SQL returns the Cartesian product of
the tables.

In a Cartesian product, each row in the first table is combined with every
row in the second table.*/
quit;

proc sql;
create table ab_both as
select a.id1, b.id2, a.* , b.*
from ad5_6a as a inner join ad5_6b as b on a.id1=b.id2;
select * from ab_both;
quit;


proc sql;
create table ab_both as
select a.* , b.*
from ad5_6a as a full join ad5_6b as b on a.id1=b.id2;
select * from ab_both;
quit;

/*data coalesce: combine two keywords as one kwords
cleanseen
*/
proc sql;
create table ab_both as
select coalesce(ad5_6a.id1, ad5_6b.id2) as id
from ad5_6a as a full join ad5_6b as b on a.id1=b.id2;
select * from ab_both;
quit;



proc sql;
select a.id1, b.id2
from ad5_6a as a left join ad5_6b as b on a.id1=b.id2;
quit;

/*describe*/
proc sql;
describe table ad5_6a, ad5_6b;
quit;

proc sql;
describe table sales.orders;
quit;

/*
create table SALES.ORDERS( compress=CHAR  bufsize=8192 )
  (
   orderid num format=BEST12. informat=BEST32.,
   customerid num format=BEST12. informat=BEST32.,
   campaignid num format=BEST12. informat=BEST32.,
   orderdate num format=YYMMDD10. informat=YYMMDD10.,
   city char(50) format=$50. informat=$50.,
   state char(2) format=$2. informat=$2.,
   zipcode char(50) format=$50. informat=$50.,
   paymenttype char(2) format=$2. informat=$2.,
   totalprice num format=BEST12. informat=BEST32.,
   numorderlines num format=BEST12. informat=BEST32.,
   numunits num format=BEST12. informat=BEST32.
  );
create index orderid on SALES.ORDERS(orderid);
*/

/*
Comparing SQL Joins and Data Step Matching-Merge What's a full / inner/ left / right join
Full join: ?if a or b;
Inner join: ?if a and b;
Left join: ?if a;
Right join: ?if b;
*/

/*
---------- The difference between union and jion ------------
Jion can remove duplicate Rows
Union will not remove duplicate rows
Join combine data record horizontally
Union combines data record vertically
*/


/*The Main Advantages and Disadvantages of PROC SQL join
1. Advantages
-PROC SQL joins do not require sorted or indexed tables.
-PROC SQL joins do not require that the columns in join
expressions have the same name. -PROC SQL joins can use comparison operators.
-Multiple data sets can be joined in one step without having common variables
in all data sets.
-You can create data sets (tables), views, or query reports
with the combined data.

2. Disadvantages
-The maximum number of tables that can be joined at one time is 32.
-Complex business logic is difficult to incorporate into the join.
-PROC SQL might require more resources than the DATA step with the
MERGE statement for simple joins.*/

/*The Main Advantages and Disadvantages of DATA STEP Match-Merge
1. Advantages -
There is no limit to the number of input data sets, other than memory.
-Allows for complex business logic to be incorporated into the new data
set by using DATA step processing, such as arrays and DO loops,
in addition to MERGE features.
-Multiple BY variables enables lookups that depend on more than
one variable. Disadvantages
-Data sets must be sorted by or indexed on the BY variable(s) prior to merging.
-The BY variable(s) must be present in all data sets, and the
names of the key variable(s) must match exactly.
-An exact match on the key value(s) must be found.*/


/*like statement*/
/*create a new table that has the same columns and attributes as an existing
table, but has no rows.*/
proc sql;
create table ad5_7a like ad5_6a;
quit;
proc contents data=ad5_7a;
run;
proc sql;
insert into ad5_7a
set d='aaaaa    ' , id1='1234', d='0       ', id1='1345',
d='aaccc    ' , id1='2345', d='        ', id1='3456'
;
quit;
/*create table*/
proc sql;
create table work.discount
(Destination char(3),
BeginDate num Format=date9.,
EndDate num format=date9.,
Discount num);
describe table work.discount;
quit;

proc sql;
create table work.flightdelays3 (drop=discount enddate)
like discount;
quit;

/*create table from  an existed table*/
proc sql;
create table work.flightdelays (keep=discount enddate)
like discount;
quit;
/*copy a table*/
proc sql;
create table work.flightdelays as select * from discount;
quit;




proc sql;
update  ad5_7a
set d='aaaaa' , id1='1234'
where id1='1234';
quit;


proc sql;
update ad5_7a
set d='azs' || case
when id1='1234' then '-x--x'
when id1='1345' then 'xdfg'
when id1='2345' then 'xxsx'
when id1='3456' then 'xdkd'
else '34444' end;
select * from ad5_7a;
quit;
data b70_42d3;
    X = '04JUL97'D ;
Y = '09:00'T ;
Time = '04JUL97:12:00'DT ;
run;
proc sql;
update b70_42d3 set Time = Time + '0:30't;
select * from b70_42d3 ;
quit;
/*Drop table*/
proc sql;
drop table b70_42d3; * drop a table/dataset ;
quit;

/*Alter table: modify, add, drop*/
proc sql;
alter table ad5_7a
modify d char length=30 label='Label D' ;
select * from ad5_7a;
quit;
proc sql;
alter table ad5_7a
modify d char length=30 label='Label D'
add Name Char length=20 label="Employee Name"
add Salary Num format=dollar8.
;
Insert into ad5_7a set
Name='John', Salary=34555;
select * from ad5_7a;
quit;
proc sql;
insert into work.discount
set destination='LHR',
begindate='01MAR2000'd,
enddate='05MAR2000'd,
discount=.33 /* one row */
set destination='CPH',
begindate='03MAR2000'd,
enddate='10MAR2000'd,
discount=.15; /* another row */
quit;

proc sql;
insert into work.discount (destination,begindate,enddate,discount)
values ('LHR','01MAR2000'd,'05MAR2000'd,.33)
values ('CPH','03MAR2000'd,'10MAR2000'd,.15);
quit;
proc sql;
alter table ad5_7a
modify d char length=30 label='Label D' format $10.
add Name Char length=20 label="Employee Name"
add Salary Num format=dollar8.
;
Insert into ad5_7a set
Name='John', Salary=34555;
select * from ad5_7a;
quit;
/*
id1 Label D Employee Name   Salary
3456 azsxdkd   .
   John $34,555
*/
proc sql;
alter table ad5_7a
modify d char length=30 format $5. label='Label D'
add Name Char length=20 label="Employee Name"
add Salary Num format=dollar8.
;
Insert into ad5_7a set
Name='John', Salary=34555;
select * from ad5_7a;
quit;
/*
id1 Label D Employee Name Salary
3456 azsxd   .
    John $34,555
    John $34,555
 */

proc sql;
alter table ad5_7a
modify id1  label='StudentID';
select * from ad5_7a;
quit;


/*use SQL create index: the benefit is when record < 15%, then fast */
proc sql;
create index id1 on ad5_7a(id1);
select * from ad5_7a;
quit;

proc sql;
create index id1_salary on ad5_7a(id1, salary);
quit;
/*delete index, remove index, drop index*/
proc sql;
drop index id1 from ad5_7a;
quit;

/*create macro variable by proc sql*/
proc sql;
select count(id1) into :id1count from ad5_7a ;
quit;
%put ID Count id1count= &id1count;

%let today2='20JAN2001';
data dt;
td = &today;
td2 = &today2;
run;
proc contents data=dt;
run;

/*****  Add constraint    */
proc contents data=ad5_6a;
run;
PROC SQL;
ALTER TABLE ad5_6a
ADD CONSTRAINT d CHECK (d IN ('Laptop', 'Phone', 'Software', 'Workstation'));
QUIT;


%macro check2(num=4, lens=5);
%let result=%sysevalf(&num+0.5);
%let total = %eval(%eval(&num) + %eval(&lens));
%put result is &result, total is &total;
%mend;
%check2(num=9,lens=5);
%check2();


%macro adt6_1(var);
%let jobs=black-smith wordsmith swordsmith;
%let type=%index(&jobs, &var);
%put type = &type;

%mend;
%adt6_1(smith);

%macro adt6_2(caseIn);
phoneStr1 = "(908)235-4490"; /*var not acceptable in macro*/
phoneStr2 = "(201) 555-77 99";
phoneStr2 = "(201] 555-77 99";
addressStr = "Ron    Cody          89   Lazy   Brook  Road   ";
put phoneStr1= phoneStr2= ;
%mend;
%adt6_2;


%macro adt6_2(caseIn);
%let phoneStr1 = "(908)235-4490"; /*var not acceptable in macro*/
%let phoneStr2 = "(201) 555-77 99";
%let phoneStr2 = "(201] 555-77 99";
%let addressStr = "Ron    Cody          89   Lazy   Brook  Road   ";
%put &phoneStr1 &phoneStr2= ;
%mend;
%adt6_2;


%macro adt6_2(caseIn);
%let phoneStr1 = "(908)235-4490"; /*var not acceptable in macro*/
%let phoneStr2 = "(201) 555-77 99";
%let phoneStr2 = "(201] 555-77 99";
%let addressStr = "Ron    Cody          89   Lazy   Brook  Road   ";
%let address=compbl(&addressStr);  /*not acceptable*/
%put &phoneStr1 &phoneStr2 &address ;
%mend;
%adt6_2;

%macro adt7_1(num=4);
%let result=%eval(&num gt 5);
%put result is &result;
%mend;
%adt7_1(num=10);


%macro adt7_1(num=4);
%let result=%eval(&num gt 5);
%put result is &result;
%mend;
%adt7_1(num=1);

%let a = 0.5;
%let b = 5;
%let c = %eval(&a + &b);
%put &c;

%let c = %eval(&b + &a);
%put &c;


%let c = %eval(&b);
%put &c;

%let c = %sysevalf(&b+&a);
%put &c;


%macro adt7_1(num=4);
%let result=%eval(&num + 5);
%put result is &result;
%mend;
%adt7_1(num=10);


%macro adt7_1(num);
%let result=%eval(&num * 5);
%put result is &result;
%mend;
%adt7_1(10);

data adt8_1;
length a 1 b 1 x 1;   /* Error:  The length of numeric variables is 3-8*/
infile datalines;
input a b x;
datalines;
1 2 3
4 5 6
;
run;


data adt8_1;
length a 8 b 3 x 8;  
infile datalines;
input a b x;
datalines;
1 2 3
4 5 6
;
run;


data adt8_1;
length 1 b 3 x ;  /*compile error, data step stopped, no dataset created*/
infile datalines;
input a b x;
datalines;
1 2 3
4 5 6
;
run;


data adt8_1;
length a 8 b 3 x ;  /*compile error, data step stopped, no dataset created*/
infile datalines;
input a b x;
datalines;
1 2 3
4 5 6
;
run;


data adt8_1;
length a 8 b 3 x 8 ;  /* dataset created*/
infile datalines;
input a b x;
datalines;
1 2 3
4 5 6
;
run;


data adt8_1;
length a 8 b 3 x 8 ;  /* has error reading data, but dataset created*/
infile datalines ;
input a b x;
datalines;
1 2 3
4 5
;
run;


data adt10_1;
input status $ count;
datalines;
serious 31
not 1419
serious 191
not 2004
Not 2004
Not 2004
Not 2004
serious 216
;
run;
proc print data=adt10_1;
where status='Not' or status='not';
run;
%macro adt10_highway;
proc sql;
%let numgrp=6;
select distinct status into :group1-:group&numgrp from adt10_1;
quit;
%put &group1 &group2 &group3 &group4 &group5 &group6;
%do i=1 %to &numgrp;
%put "&&group&i";  /* output macro loop var*/
%end;
%mend;
%adt10_highway;



data _null;
call symput('group1', ''); /*define global macro variable*/
call symput('group2', ''); /*define global macro variable*/
call symput('group3', ''); /*define global macro variable*/
run;
%macro adt10_highway;
proc sql;
select distinct status into :group1-:group3 from adt10_1;
quit;
%mend;
%adt10_highway;
/*result: group1=not, group2=serious*/
%put &group1 &group2 &group3;

%put "&group1";
%macro adt10_2; * output;
%do i=1 %to 2;
%put "&&group&i";  /* output macro loop var*/
%end;
%mend;
%adt10_2;

%macro adt10_3; * output;
%do i=1 %to 2;
%put "&group&i";  /* output macro loop var*/
%end;
%mend;
%adt10_3;

%macro adt10_4; * output;
%do i=1 %to 2;
%put '&group&i';  /* output = '&group&i'  */
%end;
%mend;
%adt10_4;

%macro adt10_5; * output;
%do i=1 %to 2;
%put "&group&i";  /* output = "Not/Serious", with warning */
%end;
%mend;
%adt10_5;


%macro adt10_7; * output;
%do i=1 %to 2;
%put &&group&i;  /* output macro loop var: not , serious */
%end;
%mend;
%adt10_7;

%macro adt10_8;
%let i=1;
%let j = 2;
%let k = %Eval(&i*&j);
%put &k;
%mend;
%adt10_8;


%macro adt10_8;
%let i=1;
%let j=0;
%do i=1 %to 100;
&j=%eval(&j+&i);
%end;
/* error*/
%put &j;
%mend;
%adt10_8;

%macro adt10_8;
%let i=2;
%let j=10;
%do i=1 %to 2;
&j=%eval(%eval(&j)+%eval(&i));  /*error*/
%put &j;
%end;
/* error*/

%mend;
%adt10_8;

%macro adt10_8;
%let i=2;
%let j=10;
%do i=1 %to 2;
j=j+i;  /*error*/
%put &j;
%end;
/* error*/

%mend;
%adt10_8;

%macro adt10_4;
%Let today = %sysevalf('01May2013'd);
%Let day_of_week = %sysfunc(weekday(&today));
%let str=%eval(342333);
%let str1 = %sysfunc(substr(&str, 1,4));
%let today1 = %sysfunc(today());

%let today2 = %sysevalf('01May2013'd);
%Let this_week_begin = %eval(&today - &day_of_week + 1);
%Let this_week_end = %eval(&today - &day_of_week + 7);
%put &today &day_of_week &this_week_begin &str1;
%put &today1 &today2;
%put D_Begin: &D_Begin %sysfunc(putn(&D_Begin,mmddyy10. ));
%put today : &today %sysfunc(putn(&today ,weekdate29.));
%put D_End : &D_End %sysfunc(putn(&D_End ,date9. ));
%mend;
%adt10_4;

/*
syntax: intnx(interval in (day, week, month, quarter, year)
,start-from: an integer of a date
,increment: an integer: negative zero or positive
,alignment in (begin, middle, end, same);
*/

data _intnx;
*monInterval=month(intnx('month',today(),-1));
monInterval=intnx('year',today(),-4);  /* 2006 = 2010-4*/
y = put(monInterval, date9.); * 3;
put y=;
monInterval=intnx('year',today(),+4);  /* 2014 = 2010+4*/
y = put(monInterval, date9.); * 3;
put y=;
monInterval2=month(intnx('month',today(),1));
put monInterval2; * 5 ;
monInterval3=month(intnx('month',today(),0));
put monInterval3;
run;

%macro adt10_5;
%let today = %sysevalf('01May2013'd);
********** by month --> then start of month ;
%let interval = month;
%let d_start_byMon =%sysfunc(intnx(&interval.,&today,-1,begin));
%let d_stop_byMon =%sysfunc(intnx(&interval.,&today,-1,end ));
%put &d_start_byMon  &d_stop_byMon;
**************** by week ---> then start of week and end of week;
%let interval = week;
%let d_start_byWk =%sysfunc(intnx(&interval.,&today,-1,begin));
%let d_stop_byWk =%sysfunc(intnx(&interval.,&today,-1,end ));
%put &d_start_byWk  &d_stop_byWk;
%let interval = week.1;  /*start week on sunday*/
%let d_start_byWk =%sysfunc(intnx(&interval.,&today,-1,begin));
%let d_stop_byWk =%sysfunc(intnx(&interval.,&today,-1,end ));
%put &d_start_byWk  &d_stop_byWk;
%let interval = week.2;  /*start week on Monday*/
%let d_start_byWk =%sysfunc(intnx(&interval.,&today,-1,begin));
%let d_stop_byWk =%sysfunc(intnx(&interval.,&today,-1,end ));
%put &d_start_byWk  &d_stop_byWk;
%mend;
%adt10_5;



/*scan function: break one var to many */
/*this= scan('this is a book', 1, ' ');*/
%macro mScan(inp);
%if %scan(&inp, 1) eq 'abc' %then
%put 'abc';
%else
%put 'None-abc';

%mend;
%mScan('abc');

%macro mScan(inp);
%if %scan(&inp, 2, '') eq 'abc' %then
%put 'abc';
%else
%do;
%* %put 'None-abc';
%put %scan(&inp, 2, ' ');
%end;
%mend;

%mscan('abb bcd abc');
/*
macro replacement:
SAS macros are evaluated before compile time:
Macro Code --> Macro Processor --> replaces macro references with actual values -->
--> compiler --> compile to machine code --> excute code
macro statement : ----------> start by %
macro varible --------------> start by &
*/

/*define macro var for repet usage purpose*/
%LET CPTCODE = '21081';
DATA adt10_6a;
SET SAS.DATASET;
WHERE CPT_CODE = &CPTCODE;
RUN;
DATA adt10_6b;
SET SAS.DATASET;
WHERE CPT_CODE = &CPTCODE;
RUN;

/*call macro variable in right way*/
%LET CPTCODE1 = 21081;
%LET CPTCODE2 = '21081';
DATA adt10_6c;
cptcode1= "&CPTCODE1";
cptcode2= &CPTCODE2;
cptcode3 = &CPTCODE1;
put cptcode1= "&CPTCODE1";
put cptcode2= &CPTCODE2;
put cptcode3= &CPTCODE3;
RUN;
proc contents data=adt10_6c;
run;
/*both cptcode are char, cptcode3 is num*/

/*proc format cntlin: use cntlin to create format from dataset */
data adt12_1;
fmtname  = 'priceFmt';
Start = '';
Label = 'Price';
run;
proc format cntlin=adt12_1;
run;
proc format library=fmtlibs fmtlib;
run;

/* create output format dataset */
proc format library=fmtlibs cntlout=ad50_38;
run;
proc print data=ad50_38;
run;
proc contents data=ad50_38;
/* ad50_38.sas7bdat , sas7bdat */
run;
data adt12_2format;
input fmtname $ start $ label $;
datalines;
priceFmt . price
ageFmt . age
timeFmt . time
;
run;
proc format cntlin=adt12_2format;
run;

proc sql;
create view adt12_2view as
select fmtname, label from adt12_2format;
quit;

proc sql;
create view adt12_2view as
select fmtname from adt12_2format;
quit;

proc sql;
create view adt12_2view as
select * from sales.orders;
quit;


data view = adt12_2view;
describe;
run;
data view=sauser.ranch;
describe;
run;
proc datasets;
run;
/*ERROR: The view WORK.ADT12_2VIEW is either corrupt or not a DATA step view*/


proc sql;
describe view adt12_2view;
quit;

proc sql;
describe table adt12_2format;
quit;

/*
SQL table WORK.ADT12_2FORMAT was created like:
create table WORK.ADT12_2FORMAT( bufsize=4096 )
  (
   fmtname char(8),
   start char(8),
   label char(8)
  );
*/

%let a=cat;
%macro adt19_animal(a=frog);
%let a=bird;
%put a is &a;
%mend;
%adt19_animal(a=pig);
%put a is &a;


%macro adt19_animal(x=frog);
%global a;
%global b;
%let a=bird;
%put a is &a;
%mend;
%adt19_animal(x=pig);
%put a is &a;

%macro adt19_animal(a=frog);
%global a;     * EROOR:  Attempt to %GLOBAL a name (A) which exists in a local environment.;
%let a=bird;
%put a is &a;
%mend;
%adt19_animal(a=pig);
%put a is &a;


proc sql;
select nvar from dictionary.tables where libname='SAUSER';
quit;


proc sql;
select nvar from dictionary.tables where libname='WORK';
quit;

proc sql;
create table tb as
select * from dictionary.tables where libname='WORK';
quit;

proc contents data=tb;
run;
/*
 22    maxvar           Num       8                              Longest variable name
  5    memlabel         Char    256                              Data Set Label
  2    memname          Char     32                              Member Name
  3    memtype          Char      8                              Member Type
  8    modate           Num       8    DATETIME.    DATETIME.    Date Modified
 21    nlobs            Num       8                              Number of Logical Observations
  9    nobs             Num       8                              Number of Physical Observations
 15    npage            Num       8                              Number of Pages
 40    num_character    Num       8                              Number of Character Variables
 41    num_numeric      Num       8                              Number of Numeric Variables
 11    nvar             Num       8                              Number of Variables
 10    obslen           Num       8                              Observation Length
 17    pcompress        Num       8                              Percent Compression
 12    protect          Char      3                              Type of Password Protection
 32    reqvector        Char     24    $HEX48.      $HEX48.      Requirements Vector
 18    reuse            Char      3                              Reuse Space
 31    sortchar         Char      8                              Charset Sorted By
 29    sortname         Char      8                              Name of Collating Sequence
 30    sorttype         Char      4                              Sorting Type
  6    typemem          Char      8                              Data Set Type
*/


proc sql;
select typemem, memname, nvar, reuse from dictionary.tables where libname='WORK' and memname='TB';
quit;
proc sql;
select nvar, * from dictionary.tables where libname='WORK';
quit;


proc sql;
select memname, nvar from dictionary.tables;
quit;


proc sql;
select * from dictionary.tables;
quit;


proc sql;
select unique(libname) from dictionary.tables;
quit;
/*
Library Name
-------------------------------
MAPS
SASHELP
SASUSER
WORK
*/
proc sql;
select * from dictionary.tables;
quit;

proc sql;
select nvar, memname from dictionary.tables where libname='WORK';
quit;
/* Number of
Variables  Member Name
    -------------------------------
1  ADT79
3  ADT87A
3  ADT87B
10  ADV6_12
10  ADV6_12B
1  ADV6_12C
1  ADV6_12D
1  ADV6_12E
1  ADV6_12F
2  ONE
8  ORDERLINE
2  TWO
*/

%macro adt22_loop;
%do i=1 %to 3;
   %put 'adc' &i;
%end;
%mend;
%adt22_loop;

Options  MLOGIC =  Yes; * Yes;* No--- Parse the macro logics;
Options Symbolgen = Yes; * yes; * display the value of a macro variable in SAS log ;
Options Mprint = No; * Yes; * yes;  /* print macro code */

/*
The NOSYMBOLGEN option suppresses macro variable resolution messages.
The NOMLOGIC option suppresses messages that trace macro execution activity including
the beginning of macro execution */


Options  MLOGIC =  yes; * Yes;* No--- Parse the macro logics, explain logic ;
Options Symbolgen = yes;  * only explain the macro vairables value ;
Options Mprint = no; * Yes; * yes;  /* print macro code */

options NOSYMBOLGEN ;
options nomlogic;
options NOMAUTOSOURCE;
options NOMAUTOLOCDISPLAY;
options MCOMPILENOTE=ALL;
/*The macro ADT22_LOOP completed compilation without errors. 5 instructions 68 bytes.*/
%macro adt22_loop;
%do i=1 %to 3;
   %put  abc &i;
%end;
%mend;
%adt22_loop;
/*SYMBOLGEN:  Macro variable I resolves to 1*/

data sWire.adt30;
input a;
datalines;
1
2
3
;
run;


options mprint = yes;
%macro adt22_mprint(par);
proc &par data=sWire.adt30;
run;
%mend;
%adt22_mprint(print);
/* add these infor to log:
MPRINT(ADT22_MPRINT):   proc print data=sWire.adt30;
MPRINT(ADT22_MPRINT):   run;
*/
options mprint = no;
%adt22_mprint(print);
/*NOTE: There were 3 observations read from the data set SWIRE.ADT30*/
options mprint;
%adt22_mprint(print);
/*
MPRINT(ADT22_MPRINT):   proc print data=sWire.adt30;
MPRINT(ADT22_MPRINT):   run;
*/

options mprint = no;
%macro adt22_msort(par);
proc &par data=sWire.adt30;
by a;
run;
%mend;
%adt22_msort(sort);

options mprint = yes;
%adt22_msort(sort);
/*
MPRINT(ADT22_MSORT):   proc sort data=sWire.adt30;
MPRINT(ADT22_MSORT):   by a;
MPRINT(ADT22_MSORT):   run;
*/


data adt51;
input salary;
datalines;
234
556
.
897
.
.
;
run;
proc sql;
select * from adt51 where salary is not missing;
quit;

proc sql;
select * from adt51 where salary is not null;
quit;


proc sql;
select * from adt51 where salary ne missing;  /* errro : ne not support*/
quit;

/*load data into memory, to speed up the procedures*/
sasfile sWire.adt30 load;
/*load into memory , one page at a time.*/
sasfile sWire.adt30 open;
sasfile sales.orderline load;

/*Options  MLOGIC = yes;  only show macro logic*/
data adt32;
set sWire.adt30(bufno=4 bufsize=4090);
run;
/*
BUFNO=, and BUFSIZE System Options
The following SAS system options can help you reduce the number of disk
accesses that are needed for SAS files, though they might increase memory usage.
*/

Option reuse=yes;
/*
it tracks and recycles free space
it specifies whether new observations are written to free space in
compressed SAS data sets.*/


data adt39_1;
input salary;
datalines;
2333
4562
3456
;
run;
proc sql;
select salary, salary*0.1 label='bonus'
from adt39_1;
quit;


proc sql;
create table sal as
select salary, salary*0.1 label='bonus'
from adt39_1;
quit;

proc sql;
create table sala2 as
select salary, salary*0.1 as bonus
from adt39_1;
select * from sala2;
quit;



proc sql;
describe table sal;
quit;
proc contents data=adt10_1;
run;

data adt41;
input x $ y @@;
cards;
P 23 P 34 A 34 A 45 R 45 R 56 R 56
;
run;
proc print data=adt41;
by x notsorted;
run;

proc print data=adt41;
by x ;
run;
/*
ERROR: Data set WORK.ADT41 is not sorted in ascending sequence. The current BY group has x =
       P and the next BY group has x = A.
*/
data adt53_proc;  ************** no conflict with the reserved keords ;
call symput('proc', 'means');
call symput('data', 'data');
call symput('sort', 'sort');
call symput('datasets', 'datasets');
call symput('contents', 'contents');
run;
%put &proc;
%put &sort;
%put &datasets;
%put &contents;
/* result:
293  %put &proc; means
294  %put &sort; sort
295  %put &datasets; datasets
296  %put &contents; contents
*/
%put _global_;
proc contents data=adt53_proc;
run;
%macro adt53_2;
call symput('prxoc', 'means');
%put 'xxxxxx';
%mend;
%adt53_2;



data adt58;
do i=1,2,3;
fileExt = i || '.txt';
put fileExt=;
end;
run;
data me;
a = "s s ss  s";
b = compress(a);
put b;
run;

/*filevar ?????????????????????????????????????????????????/*/
data adt58;
do i=1,3;
fileName = "&prjpath\sasimport\file" || put(i,1.) ||  ".txt";
fileName=compress (fileName);  /* remove blank*/
* put filename;
if (i=3) then
do;
infile temp filevar=fileName; * end=eof;
input dept $ sales;
end;
/* else if (i=1) then */ ?????????????????????????????????
/* do;*/
/* infile temp filevar=fileName; * end=eof;*/
/* input dept $ sales;*/
/* end; */
end;
run;

data adt58;
do i=1,2,3;
fileName = "&prjpath\sasimport\file" || put(i,1.) ||  ".txt";
fileName=compress (fileName);  /* remove blank*/
do until(isLast);
infile temp filevar=fileName end=isLast;
input dept $ sales;
output;
end;
end;
stop;
run;

data work.newroute;
set sasuser.rawdata;
infile in filevar = readit end = lastfile;
do while(lastfile = 0);
input @1 RouteID $7. @8 Origin $3. @11 Dest $3.
@14 Distance 5. @19 Fare1st 4.
@23 FareBusiness 4. @27 FareEcon 4.
@31 FareCargo 5.;
output;
end;
run;

data adt58;
fileName = "&prjpath\sasimport\file1.txt";
infile temp filevar=fileName;
input dept $ sales;
/* The infile TEMP is:
      Filename=E:\ChenyxFiles\2014.Study.TrainingCertificate\SAS.Victoria\SAS.Exam\SAS.Advance.Co
      de\sasimport\file1.txt,
*/
run;
/*Working with Lookup Values Outside of SAS Data Sets*/
/*
hard-code lookup values using:
?? the IF-THEN/ELSE statement
?? SAS arrays
?? user-defined SAS formats.
*/
data work.lookup1;
array Targets{1997:1999,12} $3 _temporary_;
if _n_=1 then
do;
array mnth{12} $3  ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec');
do i= 1 to 3;
do j=1 to dim(mnth);
targets{i+1996,j}=mnth{j};
end;
end;
end;
run;
data work.quarter;
do i = 9, 10, 11;
nextfile="c:\sasuser\month"
!!put(i,2.)!!".dat";
infile temp filevar=nextfile;
input Flight $ Origin $ Dest $
Date : date9. RevCargo : comma15.2;
end;

data adt58;
infile "&prjpath\sasimport\file1.txt";
input dept $ sales;
run;
%macro adt58_vars;
dept sales
%mend;

proc print data=adt58;
var %adt58_vars;
run;

/*********8 adt54_1;*/
%let value=9;
%let add=5;
%let new=%eval(&value/&add);
%put &new;
***********  result = 1 *********;

data   adt69;
a = ranuni(1255553);
put a=;
run;


/*adv73    nrstr(): not resolve  */
%let value = 9;
%let str = %nrstr(this is A & B, the &value did not parsed);
%put &str &value;

/*adt76*/
/*
TAGSORT
stores only the BY variables and the observation numbers in temporary files.
The BY variables and the observation numbers are called tags.

At the completion of the sorting process, PROC SORT uses the tags to retrieve
records from the input data set in sorted order.
When the total length of BY variables is small compared with the record length,
TAGSORT reduces temporary disk usage considerably. However, processing time might
be much higher.
*/

/*   adt77: nobs=var --> number of total observations    */
data adt77;
a = ranuni(0);
set sales.orders nobs=n_obs;
if _N_ = 500 then
do;
put a=;
put n_obs=;
end;
if _error_ = 1 then
put "error";
else
output;
run;

/*adt78*/
proc sql noprint;
create table both as
select od.*, odl.*
from sales.orders od, sales.orderline odl
where od.orderid = odl.orderid;
select * from both where monotonic() between 1 and 2;
quit;


proc sql noprint;
create table both as
select od.*, odl.*
from sales.orders as od, sales.orderline as odl
where od.orderid = odl.orderid ;
select * from both where monotonic() between 1 and 2;
quit;

proc sql ;
create table both as
select coalesce(od.numunits, odl.numunits) as numunits,
coalesce(od.totalprice, odl.totalprice) as totalprice,
coalesce(od.orderid, odl.orderid) as orderid
from sales.orders as od, sales.orderline as odl
where od.orderid = odl.orderid ;
select * from both where monotonic() between 1 and 2;
quit;

proc contents data=sales.orders;
run;

proc contents data=sales.orderline;
run;

data adt79;
input a;
datalines;
12
;
run;
%put &SYSLAST;


/*****   adt80             ********/
data one;
input num country $;
datalines;
1 Canada
2 France
3 Germany
5 Japan
;
run;
data two;
input num city $;
datalines;
3 Berlin
5 Tokyo
4 Belgium
;
run;
proc sql;
select country from one
where not exists(
select * from two where one.num=two.num
);
quit;
/*
Canada
France
*/

proc sql;
select country from one

data adt79_house;
input da;
datalines;
2
;
run;
%put  &syslast;
/* result: default upper case: WORK.ADT79_HOUSE ************/


data adt82a;
input sales year;
datalines;
233 1998
;
run;

data adt82b;
input year budget sales $;
datalines;
1998 3000 200
;
run;

/*
A match-merge in
SAS means that records from the one file will be matched up with the records
of the second file that have the same
ID. The information in the matched records is combined to form
one output record
*/
data adt82c;
merge adt82a adt82b;
by year;
run;

/*modify sales to num;*/
data adt82b;
input year budget sales ;
datalines;
1998 3000 200
1999 500 600
;
run;


data adt82c;
merge adt82a adt82b;
by year;
run;

data adt82c;
merge  adt82b adt82a; /* if conflict, use the last data */
by year;
run;
/*
Obs year budget sales
1  1998  3000  233
*/

proc contents data=adt82b;
run;
/*modify year length ;*/
data adt82b;
length year 12;
input year budget sales ;
datalines;
1998 3000 200
1999 500 600
;
run;


data adt82d;
length year 18;
input year  ;
datalines;
1998
1999
2005
;
run;


data adt82c;
merge  adt82b adt82d; /* if conflict, use the last data */
by year;
run;

proc print data=adt82c;
run;
proc contents data=adt82c;
run;
/*
Obs year budget sales
1 1998 3000 200
2 1999 500 600
3 2005 .    .
*/

data adt82e;
input years  ;
datalines;
1998
1999
2005
;
run;

data adt82c;
merge  adt82b adt82e; /* if conflict, use the last data */
run;

/*
Obs year budget sales years
1 1998 3000 200 1998
2 1999 500 600 1999
3 . . . 2005
*/


data adt82c;
merge  adt82b adt82e; /* if conflict, use the last data */
by years;  /* even no common var, merge just spread vars */
run;
proc print data=adt82c;
run;
where exists(
select * from two where one.num=two.num
);
quit;
/*
Germany
Japan
*/

/*************** adt84*****substr(str, start, len) ****************************/
%let idcode=prod567;
%let codenum= %substr(&idcode, %length(&idcode)-2);
%put &codenum;

%let codenum= %substr(&idcode, %length(&idcode)-2);
%put &codenum %length(&idcode) %substr(prod567, 5);   * 567 ;
%put &codenum %length(&idcode) %substr(prod567, 6);   * 67 ;
%put &codenum %length(&idcode) %substr(prod567, 2,3); * rod ;
%put &codenum %length(&idcode) %substr(prod567, 2,2);


/*adt86*/
/* data view --> always access the most current data */

data adt87a;
input year qtr budget;
datalines;
2001 3 500
2001 4 400
2002 1 700
;
run;
data adt87b;
input year qtr sales;
datalines;
2001 3 300
2001 4 600
;
run;
proc sql;
select adt87a.*, sales
from adt87a, adt87b;
quit;

proc sql;
select adt87a.*, adt87b.*
from adt87a, adt87b;
quit;


proc sql;
select a.*, b.*
from adt87a as a, adt87b as b
where a.year = b.year;
quit;


proc sql;
select coalesce(a.year, b.year) as year,
  coalesce(a.qtr, b.qtr) as qtr,
  budget, sales
from adt87a as a, adt87b as b
where a.year = b.year;
quit;

proc sql;
select budget, sales
from adt87a, adt87b;
quit;


data adt93;
input group $ sum;
datalines;
A 765
B 123
C 564
;
run;
data _null_;
set adt93;
call symput(group, sum);
call symput('sign', sum);
run;
%put &A &B &C &sign;

data adt94;
do i=1 to 10000000;
a = ranuni(i);
b = ranuni(i+1000);
c = ranuni(i+10000);
output;
end;
run;
/*real time           7.48 seconds*/
/*cpu time            6.19 seconds*/

proc sql;
select * from adt94 where monotonic() between 1 and 100;
quit;
data adt94b;
set adt94;
if a > 0.5;
total = a+ b + c;
run;
/*
NOTE: The data set WORK.ADT94B has 4996789 observations and 5 variables.
NOTE: DATA statement used (Total process time):
      real time           5.39 seconds
      cpu time            4.67 seconds
*/

data adt94b;
set adt94;
where a > 0.5;
total = a+ b + c;
run;

/*
NOTE: The data set WORK.ADT94B has 4996789 observations and 5 variables.
NOTE: DATA statement used (Total process time):
      real time           6.11 seconds
      cpu time            5.66 seconds
*/

proc sql;
create table adt94c as
select *, (a+b+c) as total from adt94
where a > 0.5;
quit;
/*
Table WORK.ADT94C created, with 4996789 rows and 5 columns.
      real time           7.61 seconds
      cpu time            7.22 seconds
*/

data adt95;
input country $ city $ visit;
datalines;
USA Boston 10
UK London 5
USA Dallas 10
UK Marlow 10
USA Boston 20
UK London 15
USA Dallas 10
;
proc sql;
select country, city, sum(visit) as total
from adt95
group by country, city
order by country, total desc;
quit;


proc sql;
select country, city, sum(visit) as total
from adt95
group by country, city
order by 1, total desc;
quit;


proc sql;
select country, city, sum(visit) as total
from adt95
group by country, city
order by 1, 3;  /* 1=country, 3 = total */
quit;


proc sql;
select country, city, sum(visit) as total
from adt95
group by country, city
order by 2;  /* 2 = city */
quit;



proc sql;
select country, city, sum(visit) as total
from adt95
group by 2
order by 2;  /* 2 = city */
quit;

proc sql;
select country,sum(visit) as total
from adt95
group by country ;
quit;

proc sql;
select country, city, sum(visit) as total
from adt95
group by country ;
quit;


proc sql;
select country, city, sum(visit) as total
from adt95
group by country
order by country, city;
quit;


proc sql;
select city, sum(visit) as total
from adt95
group by city
order by city;
quit;

proc sql;
select country , city, sum(visit) as total
from adt95 ;
quit;

proc sql;
select country , city, sum(visit) as total
from adt95
group by visit
order by total desc;
quit;
proc sql;
select country , city, sum(visit) as total
from adt95
group by visit
order by total desc, country, city;
quit;


data adt96a;
input num char1 $;
datalines;
1 A
2 B
4 D
;
run;

data adt96b;
input num char2 $;
datalines;
2 X
3 Y
5 V
;
run;

data adt96c;
set adt96a adt96b;
run;
proc print data=adt96c;
run;

proc sql;
create table adt96c as
select * from one outer union corr
select * from two ;
select * from adt96c;
quit;

proc sql;
create table adt96c as
select * from one outer union
select * from two ;
select * from adt96c;
quit;

proc sql;
create table adt96c as
select * from one intersect
select * from two ;
select * from adt96c hh;
quit;


%put &RC &ERR ; * no such macro var ;
%put &SYSRC ; /*  Contains the last return code generated by your operating system. */
%put &SYSERR;
/*
Value Description
0 Execution completed successfully and without warning messages.
1 Execution was canceled by a user with a RUN CANCEL statement.
2 Execution was canceled by a user with an ATTN or BREAK command.
3 An error in a program run in batch or non-interactive mode caused SAS to enter syntax-check mode.
4 Execution completed successfully but with warning messages.
5 Execution was canceled by a user with an ABORT CANCEL statement.
6 Execution was canceled by a user with an ABORT CANCEL FILE statement.
>6 An error occurred. The value returned is procedure-dependent.

1008 General data problem
1012 General error condition
1016 Out-of-memory condition
1020 I/O problem
2000 Semantic action problem
2001 Attribute processing problem
3000 Syntax error
4000 Not a valid procedure
9999 Bug in the procedure
20000 A step was stopped or an ABORT statement was issued.
20001 An ABORT RETURN statement was issued.
20002 An ABORT ABEND statement was issued.
25000 Severe system error. The system cannot initialize or continue.
*/


data adt99;
input rep $ area $ cost;
datalines;
smith north 100
smith south 200
jones east 100
smith north 300
jones west 100
jones north 200
jones north 400
smith north 400
jones west 100
jones west 300
;
run;
proc sql;
select rep, area, count(*) as total
from adt99
group by rep, area;
quit;
/*
rep area total
jones east 1
jones north 2
jones west 3
smith north 3
smith south 1
*/



proc sql;
select rep, area, sum(cost) as total
from adt99
group by rep, area;
quit;
/*
rep area total
jones east 100
jones north 600
jones west 500
smith north 800
smith south 200

*/



data adt101;
input steering $ status $;
datalines;
absent serious
absent not
absent serious
absent not
absent serious
;
proc sql;
select count(distinct status) into :numgrp from adt101;
quit;
%put &numgrp; * = 2;

proc sql;
select count(status) into :numgrp from adt101;
quit;
%put &numgrp; * = 5;


proc sql;
select unique(status) into :numgrp from adt101;
quit;
%put &numgrp; * = not serious;


proc sql;
select count(unique(status)) into :numgrp from adt101;
quit;
%put &numgrp; * = 2;


data adt104;
input rep $ area $ cost;
datalines;
smith north 100
smith south 200
jones east 100
smith north 300
jones west 100
jones north 200
jones north 400
smith north 400
jones west 100
jones west 300
jones west 100
jones west 300
;
run;
proc sort data=adt104 out=adt104b nodup;
by rep ;
run;
proc print data=adt104b;
run;

proc sort data=adt104 out=adt104b nodupkey;
by rep;
run;
proc print data=adt104b;
run;

%put &sysday;
data adt109;
if &sysday = Tuesday then
put "sysday=Tuesday";
else
put "sysday ne Tuesday";

if "&sysday" = "Tuesday" then
put "sysday=Tuesday (2)";
else
put "sysday ne Tuesday";
run;

proc contents data=adt109;
run;


/*adt110:*/
/*data view contain partially compiled  data step*/

data adt111a;
input common $ x;
datalines;
A 10
A 13
B 14
B 9
C 8
C 14
;
run;


data adt111b;
input common $ y ;
datalines;
A 1
A 3
A 4
B 2
C 5
;
run;
/* key test point */
data adt111d;
set adt111a;
set adt111b;
run;

data adt111d;
set adt111a;
set adt111b;
by common;
run;

proc print data=adt111d;
run;

data adt111d;
set adt111a adt111b;
run;
proc print data=adt111d;
run;

data adt111c;
merge adt111a adt111b;
by common;
run;

proc print data=adt111c;
run;


data adt151c;
set adt111a ;
set adt111b;
run;
proc print data=adt151c;
run;
/*
Obs    common     x    y

1       A       10    1
2       A       13    3
3       A       14    4
4       B        9    2
5       C        8    5
*/


/*
Obs common x y
1 A 10 1
2 A 13 3
3 A 13 4
4 B 14 2
5 B 9 2
6 C 8 5
7 C 14 5
*/


%let start=%sysfunc(today(), weekdate.);
%let start=%sysfunc(today(), date9.);
%put &start;

/*adt152*/
%let name=%str(peter%'s restaurant);
%put &name;

%let name2= peter%str("'")s restaurant;
%put &name2;  * error;

%let name2= peter%str(%')s restaurant;
%put &name2;  * error;


%let name2= peter%str(%str(%'))s restaurant;
%put &name2;  * peter's restaurant;


%let name2= peter%str(A%')s restaurant;
%put &name2;  * peterA's restaurant;


%let name2= peter%str("%'")s restaurant;
%put &name2;  *  peter"'"s restaurant;

%let name2= %str(peter%str(%')s restaurant);
%put &name2;  *  peter's restaurant;

* error;


/*adt119.
in compressed dataset, REUSE=YES option,
allows new observations to be inserted wherever enough free space exists.*/



data adt144;
input category $ age salary bonus;
datalines;
M 28 200 20
M 25 100 10
M 28 300 10
M 33 300 30
F 18 100 50
F 25 200 10
F 35 400 50
;

run;


proc sql;
select age+1 as agePlus from adt144
where calculated agePlus > 10 /* success, where can use calculated.*/;
quit;

proc sql;
select avg(age) as ageGrp from adt144
where calculated ageGrp > 10 /* ERROR: Summary functions are restricted to the SELECT and HAVING clauses only..*/
group by category;
quit;

proc sql;
select category,avg(age) as avgAge from adt144
where calculated  avgAge > (select avg(age) from adt144)
/* ERROR: Summary functions are restricted to the SELECT and HAVING clauses only.*/
group by category ;
quit;

proc sql; /* correct : M 28.5 */
select category,avg(age) as avgAge from adt144
group by category
having avgAge > (select avg(age) from adt144);
quit;

proc sql outobs=15;
select * from work.adt144;
quit;

proc sql;
select category, sum(salary, bonus) as earnings
from adt144
group by category;
quit;

proc sql;
select category, sum(sum(salary, bonus)) as earnings
from adt144
group by category;
quit;

proc sql;
select distinct category, sum(sum(salary, bonus)) as earnings
from adt144
group by category;
quit;

proc sql;
select distinct category, sum(sum(salary, bonus)) as earnings
from adt144
where age < 30
group by category
having calculated earnings< 500;
quit;



proc sql;
select distinct category, age, sum(salary, bonus) as earnings
from adt144
where sum(salary, bonus) > 220   /* note warning: where caluse can only support non-????*/
group by category having avg(age) > 28
order by category desc;
;


proc sql;
select distinct category, age, sum(salary, bonus) as earnings
from adt144
where calculated earnings > 220
group by category having avg(age) > 28
order by category desc;
;
quit;


proc sql;
select distinct category, age, sum(salary, bonus) as earnings
from adt144
where bonus > 10
group by category having avg(age) > 28
order by category desc;
;
quit;


proc sql;
select distinct category, age, sum(salary, bonus) as earnings
from adt144
group by category having avg(age) > 28
order by category desc;
;
quit;

proc sql;
select distinct category
from adt144
group by category
order by category desc;
;
quit;


proc sql;
select distinct category, sum(salary, bonus) as earnings
from adt144
group by category
order by category desc;
;
quit;

proc sql;
select distinct category, sum(salary, bonus) as earnings
from adt144
where age  < 30
group by category;
quit;



proc sql;
update adt144
set salary = 200
where category = 'M' and bonus=10;
quit;


proc sql;
select category, age, avg(age) as avgAge, avg(salary+bonus) as income from adt144
group by category
order by calculated income desc;
quit;

proc sql;
select category, age, avg(age) as avgAge, avg(salary+bonus) as income from adt144
group by category
order by age desc;
quit;


proc sql;
select age, (bonus+salary) as income, sum(salary) as totalS from adt144
group by salary having salary >100 and salary <400
order by age desc;
quit;

proc sql;
select age, bonus from adt144
group by salary having salary >100 and salary <400
order by age desce;
quit;
proc sql;
select distinct category, salary
from adt144
where salary > 100
group by category having avg(age) > 20
order by category, 2;
quit;
/*
-------------
select * -> from *
where
group by having
order by
-------------
where is a constraint, where can not contain sum/min/max/avg/count,
first where , then ---> sum/count.

Having can have sum/min..., first sum... --> then having *
*/


/*adt153*/
%let var=chicago,1;
data a;
var = 'new york,2';
newvar = %scan(&var,2,%str());
run;

proc means data=orderline;
var totalprice;
title Today is: &sysdate.;
title Today is: &sysdate9.;
title Today is: &today.;
title Today is: %sysfunc(today(),worddate.);
title "Report Produced on %sysfunc(today(),worddate.)";
title Report Produced on %sysfunc(left(%sysfunc(today(),worddate.)));
run;

proc means data=orderline;
title Report Produced on %sysfunc(left(%sysfunc(today(),worddate.)));
/* Error:
The LEFT function expects only one argument, but you are passing 揓une 7, 2007?to
it. It interprets the comma as the delimiter between two arguments.
*/
run;

proc means data=orderline;
title "Report Produced on %sysfunc(left(%qsysfunc(today(),worddate.)))";
/* success */
run;

proc chart data=orderline;
hbar totalprice / sumvar=billdate;
run;

proc plot data=orderline;
plot numunits*billdate;
run;

/* Delimiters in Macro Variable Names */
%let prefix=g;
%let macroName = &prefix.plot;   /* use . connect a macro with a char */
%let dotName = &prefix..plot;
%put &macroName;
%put &dotName;



data macro1; /* macro in data step, macro processor only works once */
input control;
if _N_=1 then
do;
%let foot = foot1;
end;
if _N_=2 then
do;
%let foot = foot2;
end;
footStr = "&foot";
datalines;
1
2
3
4
;
run;
data macro1; /* macro in data step, macro processor only works once,
the value of foot will always be whatever the last %LET statement assigns */
input control;
if control=1 then
do;
%let foot = foot1;
end;
if control=2 then
do;
%let foot = foot2;
end;
footStr = "&foot";
datalines;
1
2
3
4
;
run;

proc print data= macro1;
run;
/*
The word scanner then continues to read the program and passes the second %LET
statement in the DATA step to the macro processor as well. This time, the macro
processor reassigns the value All Students Have Paid to foot in the symbol table.
When the RUN statement in the DATA step is encountered, SAS recognizes that the
step is complete, and executes it. Remember that at this point the DATA step no longer
includes any of the %LET statements (which have already been executed by the macro
processor). Because the %LET statements are always processed by the macro processor
before the DATA step is executed, the value of foot will always be whatever the last
%LET statement assigns.
*/
%let lib=sasuser;
libname &lib 'SAS-data-library';
proc &graphics.chart data=&lib.y&year&month;

%put %sysfunc(today(),worddate.);
%put "Today is: &sysdate.";
%put "Today is: &sysdate9.";

data testDate;
title %sysfunc(today(),worddate.);
a = 1;
* title "Today is: &sysdate.";
* title "Today is: &sysdate9.";
run;
proc print data=testdate;
run;

data testDate;
a = 1;
title "Today is: &sysdate.";
* title "Today is: &sysdate9.";
run;
proc print data=testdate;
run;

data testDate;
a = 1;
title "Today is: &sysdate9.";
run;
proc print data=testdate;
run;

************** ADV 50 practise****************************;
/*matrix[1:2,2] (1,2)*/
/*By admin | August 24, 2009*/


data ad50_1a;
input X A $;
datalines;
1 a
2 b
4 c
;
run;

data ad50_1b;
input X B $;
datalines;
2 x
3 y
5 v
;
run;
proc sql;  /* catersian product of tables */
select * from ad50_1a,ad50_1b;
quit;
proc sql;
select a.* , b.* from (ad50_1a as a right join ad50_1b as b on a.X ne 0);
quit;

proc sql;
select a.* , b.* from (ad50_1a as a right join ad50_1b as b on a.X=b.X);
quit;

data ad50a;
input year qtr budget;
datalines;
2001 3 500
2001 4 400
2002 1 700
;
run;
data ad50b;
input year qtr sales;
datalines;
2001 3 300
2001 4 600
2005 3 900
;
run;
proc sql;
select * from ad50a , ad50b;
quit;
proc sql;
select a.*, b.sales from ad50a as a left join ad50b as b on a.year=b.year and a.qtr=b.qtr;
quit;

proc sql;
select a.*, b.sales from ad50a as a right join ad50b as b on a.year=b.year;
quit;


proc sql;
select a.*, b.sales from ad50a as a join ad50b as b on a.year=b.year;
quit;

proc sql;
select a.*, b.sales from ad50a as a full join ad50b as b on a.year=b.year;
quit;


/*
set operator: Except|Intersect|Union|Outer Union|
Keywords:     ALL, Corr
----------
1. Except: select Unique rows from the first table that are not found in the
second table
2. Intersect: select unique rows that are common to both tables
3. Union: select unique rows from one or both tables
4. Outer Union: select all rows from both tables

ALL -- Makes only one pass through the data and does not remove duplicate rows

Corr -- Compares and overlays columns by name instead of by position;
when used with except, intersect and union, removes any columns that do not
have the same name in both tables; when used with outer union, overlays
same-named columns and displays columns that have non-matching names without
overlaying.
*/
/*
except all|corr
intersect all|corr
union all|corr
outer union all/corr
*/
/*union 1. intersect all|corr*/
/*if change some condition or add some filters*/
proc sql ;
select year, qtr from ad50a as a intersect select year, qtr from ad50b as b ;
quit;
proc sql ;
select year, qtr from ad50a as a intersect all select year, qtr from ad50b as b ;
quit;
proc sql ;
select year, qtr from ad50a as a intersect corr select year, qtr from ad50b as b ;
quit;
proc sql ;  /* corr: only choose the matched columns by name*/
select * from ad50a as a intersect corr select * from ad50b as b ;
quit;
***;
proc sql ; /* Note: no rows returned */
select * from ad50a as a intersect select * from ad50b as b ;
quit;

proc sql ;  /* no result */
select year, qtr from ad50a as a intersect  select * from ad50b as b ;
quit;
/*no row returned*/
proc sql ;
select * from ad50a as a intersect all select * from ad50b as b ;
quit;

/*FFFFFFFFFFFFF: when intersect used with corr option, then only show overlapped columns*/
proc sql ;
select * from ad50a as a intersect corr select * from ad50b as b ;
quit;

/*union 2. except all|corr*/
/*if change some condition or add some filters*/
proc sql ;
select year,qtr from ad50a as a except select year,qtr from ad50b as b;
quit;
proc sql ; /*  except all = except */
select year,qtr from ad50a as a except all select year,qtr from ad50b as b;
quit;

proc sql ; /*  except corr = except */
select year,qtr from ad50a as a except corr select year,qtr from ad50b as b;
quit;

/*select **/
proc sql ; /*  except corr : set operation on matched columns only  */
select * from ad50a as a except corr select  * from ad50b as b;
quit;

proc sql ; /*  except corr = except */
select * from ad50a as a except all select * from ad50b as b;
quit;

/*Q: if add condition*/

/*union 3. union all|corr: merge common and make unique */
proc sql;
select year, qtr from ad50a as a union
select year, qtr from ad50b as b;
quit;

proc sql;  /* can be used to select unique pattern */
select year from ad50a as a union
select year from ad50b as b;
quit;


proc sql;  /* do not remove duplicate, merge rows by position, regardless col name */
select  qtr, year from ad50a as a
union all
select year, qtr from ad50b as b;
/* = a + b, do not remove duplicate */
/*ALL - Makes only one pass through the data and does not remove duplicate rows*/
quit;


proc sql;
/*warning: A table has been extended with null columns to perform the UNION ALL set operation.*/
select  qtr from ad50a as a
union all
select year, qtr from ad50b as b;
quit;

proc sql;
select * from ad50a as a
union all
select *  from ad50b as b;
/* = a + b, do not remove duplicate */
/*ALL - Makes only one pass through the data and does not remove duplicate rows*/
quit;

proc sql;
select * from ad50a as a
union all corr
select *  from ad50b as b;
quit;

proc sql;
select year, qtr from ad50a as a union corr
select year, qtr from ad50b as b;
quit;

/*select * */
proc sql;
select * from ad50a as a union
select * from ad50b as b;
/* according to position, the label according to the first table */
quit;

proc sql;
select * from ad50b as b union
select * from ad50a as a;
/* union: according to position, the label according to the first table */
quit;

proc sql;
select * from ad50b as b union all  /* =a + b*/
select * from ad50a as a;
/* union all: according to position, the label according to the first table */
quit;
proc sql;
select * from ad50b as b union corr
select * from ad50a as a;
/* union corr: according to column name, the label according to the first table */
quit;

/*union 4. outer union all|corr*/
proc sql;
select year, qtr from ad50a outer union  /* the columns side-by-side */
select year, qtr from ad50b;
quit;
proc sql;
select * from ad50a outer union  /* put columns from both tables side-by-side */
select * from ad50b;
quit;


proc sql;
select * from ad50a outer union all
select * from ad50b;
/* not supported: The option or parameter is not recognized and will be ignored  */
/*outer union corr: the same column -- use all rows, the different column -- put side by side*/
quit;
proc sql;
select * from ad50a outer union corr
select * from ad50b;
/*outer union: the same column -- use all rows, the different column -- put side by side*/
quit;


Data ad50_4a;
input name $ course $;
datalines;
Lauren Math1
Patel Math1
Chang Math1
Chang Math1
Chang Math1
Patel .
Patel Mat/
Chang .
;
run;

Data ad50_4b;
input name $ course $;
datalines;
Smith Math2
Farmer Math2
Patel Math2
Hillier Math2
Patel .
Hillier .
;
run;

proc sql;
select * from ad50_4a where course like 'Mat%';
quit;

proc sql;
select * from ad50_4a where course like 'Mat/%';
/*no rows were selected*/
quit;


proc sql;
select count(*) from ad50_4a where course like 'Ma%';
quit;
proc sql;
select count(*) from ad50_4a ;/*count(*) include those with missing values*/
quit;


/*the equavilant of datastep and proc sql*/
data ad50_8;
set ad50_4a ad50_4b;
run;
proc print data = ad50_8;
run;
/*equals to proc sql outer union*/
proc sql ;
select * from ad50_4a outer union corr
select * from ad50_4b;
quit;

proc sql ;
select * from ad50_4a where name in ('Patel', 'Chang',) outer union corr ;
select * from ad50_4b where name in ('Patel', 'Chang');
quit;


proc sql ;
select * from ad50_4a where name in ('Patel', 'Chang',) union corr ;
select * from ad50_4b where name in ('Patel', 'Chang');
quit;


proc sql ;
select name from ad50_4a where name in ('Patel', 'Chang',) union corr
select name from ad50_4b where name in ('Patel', 'Chang');
quit;


proc sql ;
select name from ad50_4a where name in ('Patel', 'Chang',) outer union corr
select name from ad50_4b where name in ('Patel', 'Chang');
quit;


proc sql ;
select name from ad50_4a union corr select name from ad50_4b ;
quit;

proc sort data=ad50_4a out=ad50_4aa;
by name;
run;

proc sort data=ad50_4b out=ad50_4bb;
by name;
run;
data ad50_6a;
set ad50_4aa ad50_4bb;  /* output all rows, sort automatical by alphabet order.*/
by name;
run;
proc print data=ad50_6a;
run;

/*SAS in-live view*/
/*An in-line view is a nested query that is specified in the FROM clause.
An in-line view selects data from one or more tables to produce a temporary
in-memory table. This virtual table exists only during the query.
The main advantage of using an in-line view is to reduce the complexity of the code.
An in-line view is a SELECT statement within a SELECT statement, which we call a nested statement. Nested
SELECT statements sound complex but they are not.

In some cases, PROC SQL might be able to process the code more efficiently.

Unlike other queries, an in-line view cannot contain an ORDER BY clause.

PROC SQL;
CREATE TABLE AS MYTABLE
SELECT *
FROM (SELECT TABLE1.FIELD FROM LIBREF.TABLE1 WHERE SOME_CONDITION = TRUE;)
MYVIEW, LIBREF.TABLE2
WHERE MYVIEW.FIELD = TABLE2.FIELD;

In-line views are views in the FROM
clause. They are extended temporary tables which
are constructed by UNION, INTERSECT, EXCEPT,
and JOINs based on tables and subqueries. In-line
view can not correlate with its main query, but outer
(main) query can refer any values in the tables and
subqueries which constitute the in-line view.

*/

/*Compare append, set, outer union in efficiency*/
/*
SET statement in the DATA step
OUTER UNION CORR operator in the SQL procedure read all observations from all input data sets
Append : only read the appended dataset
Insert : only read the inserted dataset
*/


Data ad50_12a;
input name $ course $;
datalines;
Smith Math2
Farmer Math2
Patel Math2
Hillier Math2
;
run;

Data ad50_12b;
input name $ course $;
datalines;
Smith Math3
Farmer Math3
Patel Math3
Hillier Math5
Hiker Math6
;
run;
data smallset;
set ad50_12a;
run;

proc print data=smallset;
run;
proc sql; /*g 6-[ insert data from a in-line view */
insert into ad50_12a(name,course)
select name, course from ad50_12b;
select * from ad50_12a;
quit;

proc sql;
select name from ad50_12a where course in
(select distinct course from ad50_12a);  /* right */
quit;

proc sql;
select name from ad50_12a where course in
(select distinct course from ad50_12a order by course); /* error: an in-line view cannot contain an ORDER BY clause.*/
quit;

proc print data=ad50_12a;
run;

proc append base=ad50_12a data=smallset;
run;
proc sql;
select * from ad50_12a;
quit;

%let aName=Farmer;
proc sql;
select * from ad50_12a where name="&aName";
/*return correct values*/
quit;
proc sql;
select * from ad50_12a where name=&aName;
/*error: not found*/
quit;
/*
The INSERT INTO statement appends data to the Company.Employee table.
The ALTER TABLE statement (rather than the ALTER statement) enables you to add, drop,
and modify columns, but you cannot use it to append data sets.
The UPDATE statement enables you to modify values in existing rows, but you cannot
use it to append data sets.
The MODIFY clause works with the ALTER statement to enable you to change column
attributes, but you cannot use it to append data sets.
*/



/*
Key Practise
Join, Left Join, Right Join, Full Join:
select a.*, b.any.
Question: rows (not catasian product), columns from both table, dataset model,
Except, Intersect:
select common, comparable, can not include other column
Corr: only choose common column,
ALL, do not remove duplicate
Union: by position, Merge two dataset by row, column decide by select clause , and make Unique
union all:  do not remove duplicate, merge rows by position, regardless col name, col name decide by select clause
union all corr : merge rows, do not remove duplicate, compare the common columns
outer union(all): not supported
outer union (corr): put common columns first, then unmatched columns from both tables

SQL and macro:
*/

/* macro quoting functions: ****************************************;
%str(), %nrstr(), %quote(), %nrquote(), %bquote(), %nrbquote(), %superq()
and %unquote().
http://www.datasavantconsulting.com/roland/mquoting.html
*/

%let str=Roland's place; /* have problem , -->use %str(), %bquote(), %nrbquote()*/
%let str2=, Biberach';  /* have problem */
%let stra = %nrbquote(Roland's place, biberach);
%put &stra;  /*Roland's place, biberach*/

data _null_;
call symput('mvar',"Roland's place, Biberach"); /* have problem because single quote */
run;
%macro quotaa;
%let stra=%str(Roland, Place);  /* solve cooma */
%put &stra;
%let strb = %str(Rolands place, Biberach);  /* solve cooma */
%put &strb;
%let strc = %str(Roland%'s place, Biberach); /* put a % before ' */
%put &strc;
%let strb = %nrstr(Raland &Place, Bob%'s book);   /*Raland &Place, Bob's book*/
%put &strb;

%let a = 5;
%let b = %str(not good &a); /* not good 5*/
%put &b;
%let c =  %nrstr(not good &a); /* not good  &a*/
%put &c;

%quote;   /*obsolete functions*/
%nrquote ;  /*obsolete functions*/

%let d = %bquote(this is duke's book, and that) ; /* blind quote ,  used for dynamic quote*/
%put &d;  /*this is duke's book, and that*/
&d = &c; /* error: macro is fixed, static */
%let e = %bquote(this is duke's book, and &b);
%put &e ;  /*this is duke's book, and not good 5*/
%let e = %bquote(this is duke's book, and &e);  /* resolve &e */
%put &e ;  /*this is duke's book, and not good 5*/

%let f =  %nrbquote(this is duke's book, and &b);
%put &f;   /*this is duke's book, and not good 5*/
%let f =  %nrbquote(this is duke's book, and &f); /* resolve &f */
%put &f;   /*this is duke's book, and not good 5*/
%mend;
%quotaa;

data ad50_15;
x = 10;
y = '2';
c = x * y;
put c;  /* c = 20 : character converted to numeric value */
run;

proc sql;
select course into :courses separated by ' ' from ad50_12b;
quit;
data _null_;
cs = symget('courses');
put cs=;
array css {5} $ css1-css5;
do i=1 to 5;
css[i] = scan(cs, i, ' ');
put css[i];
end;
run;
%put &courses;
/*
yearcutoff: 1920-1999(20-99)-2019(00-19)
yearcutoff=1950: 1950-2049
1950-1999-2049
yearcutoff=1700: 1700-1799(00-99)
*/
options yearcutoff=2014;
data ad50_20;
options yearcutoff=2014;
date = '20Jul14'd; /*has nothing related to data value: 19924 */
put date;
run;
options yearcutoff=1915;

data ad50_20;
options yearcutoff=1910; * 1915-2015, 1910-2009, 1910-1999-2009;
date = '20Jul1914'd;  /*has nothing related to data value: -16601*/
put date;
run;

%let var=chicago's,1;  /* error */
%put &var;
%let var=chicago%'s,1;  /* correct */
%let var=chicago,1;  /* correct */
%put &var;
data ad50_22;
 var = 'new york,2';
newvar = %scan(&var,2,%bquote(,));
run;


/*ad50_23*/
%let idcode=Prod567;
%let codenum=%substr(&idcode,%length(&idcode)-2);
%let codenum=%substr(&idcode,length(&idcode)-2); /* error, length is a data step function*/
%put &codenum;

data _null_;
a = 'prod567';
b = length(a);
put a= b=;
run;


/*ad50_24*/
proc print data=ad50_22;
data _null_;
a = 19924;
b = put(a, date9.);
put b=;
run;
* title "The title is %sysfunc(putn(a, date9.))";
run;

%let firstGuy = Smith;
proc sql;
select * from ad50_12a where name=symget('firstGuy');
quit;

proc sql;
select * from ad50_12a where name="&firstGuy";
quit;

proc sql;
select * from ad50_12a where name=%symget('firstGuy');  /* error */
quit;

/*
The MPRINT option displays text that is sent to the compiler by a macro.
The MLOGIC option writes messages that trace the execution of macro statements but
does not write the text that is generated by the macro.
SOURCE2 writes source statements that are inserted by %INCLUDE statements.
MSOURCE is not a valid SAS option.
*/
/*29- recite one*/
proc sql;
create table ad50_30 as
select * from ad50_12a where name='smith';
quit;

proc append base=ad50_30 data=ad50_30 force; * combine merge --> not valid ;
run;

proc print data=ad50_30;
run;
proc print data=ad50_12a;
run;


/*ad50_31*/
/*
SAS format search sequence:
SAS internal format --> user defined format in Work.Formats catalog --> Library.Formats --> fmtsearch location list
options fmtsearch=(libA, libB...)
*/


/*merge: before merge should sort*/
data base_34_a;
input fname $ transactionAmt;
datalines;
Bruce 30
Dan 40
Kevin 50
Duke 40
Duke 80
Duke 90
Susan 39
;
run;

data base_34_b;
input fname $ saleamt;
datalines;
Bruce 25000
Bruce 35000
Bruce 75000
Dan 25000
Kevin 65000
Duke 45000
Adam 10000
;
run;

data base_34_1;
merge base_34_a base_34_b; **  blind merge, by record order;
run;
proc sql;
select * from base_34_1;
quit;

/*sort then merge*/
proc sort data=base_34_a out=base_34_a1;
by fname;
run;
proc sort data=base_34_b out=base_34_b1;
by fname;
run;
data base_34_1;
merge base_34_a1 base_34_b1; **  blind merge, by record order;
by fname;
run;
proc sql;
select * from base_34_1;
quit;
proc print data=base_34_1 uniform;
run;
/*
 fname              Amt   saleamt
------------PDV Re-Initialize new common items come -----------------------------
Adam                 .     10000
Bruce               30     25000
Bruce               30     35000
Bruce               30     75000
Dan                 40     25000
Duke                40     45000
Duke                80     45000
Duke                90     45000
Kevin               50     65000
Susan               39         .
*/

/*ad50_34*/
/*
Reducing the length of numeric variables that have either non-integer values or large
integer values results in (a loss of numeric precision).

Reduced-length numeric variables require (less I/O) to read because the data set that
contains them is smaller.

Reduced-length numeric variables must be expanded to 8 bytes when they are read; t
herefore, the CPU resources required to process them is increased.

Reduced-length variables are always stored in the program data vector in a length of 8.
*/

/*ad50_35*/
/*
according to compressed SAS data sets
To reuse deleted observation space, the REUSE=YES option must be specified when the
data set is compressed.
By default, the POINTOBS= option has a value of YES so that the observations are
addressable by observation number.
When the data sets are compressed, they are smaller and it therefore requires fewer
I/O operations to read them.
*/

/*Hash table*/
/*
hash object is memory based;
Hash object is created in a DATA step, and when the DATA step ends the hash object
disappears.
The data that are loaded into the hash object can be saved in a SAS dataset,
but the hash object itself is not saved. This is behavior similar to SAS arrays;
the columns which make up an array may be saved in a SAS dataset, but the array
disappears.

iterator : A hash object can have an associated iterator object. While the hash object actually stores your data in memory, the
associated iterator object is used to traverse the hash object ?either forward (i.e. from first to last record) or
backward (i.e. from last to first record).

The memory required for a hash object (and an associated iterator object) is allocated at run time; as you add or
remove items (records) from the hash object SAS will automatically allocate or release memory.

*/
proc contents data=orderline;
run;

data hashOrder;
set orderline(keep=orderid billdate totalprice);
run;

data hashSet;
length
orderid 8.
billdate 8.
totalprice 8.
;

declare hash hsod(); /* hash order */
key='orderid';
rc = hsod.defineKey(key);
rc = hsod.defineData('billdate','totalprice');
rc=  hsod.defineDone();
i = 1; j= 1;
put i= j=;
do while (not EOF);
 set hashOrder end=EOF;
 rc = hsod.add(); /* catch return code */
 put rc=;
 if rc NE 0 then /* if success, then rc = 0 */
do;
j=j+1; *put "Problem with .add()" ;
end;
 else
i=i+1;
put i=;
if i>5 then stop;
put 's';
 end;
stop;
put i=; put j=;
put 'hash records=' i= 'Failure Record=' j=;
run;

/*
if _n_ = 0 then set data.feecodes;
This removes a potential maintenance issue in addition to cutting down on the amount of typing we need to do. Since
_N_ will never equal zero (0), the statement is not executed and no records from the dataset will be read; however,
the SAS compiler will read the metadata (in particular the column names and attributes) from the dataset and add the
columns to the program data vector (PDV). Now if feecode changes from four bytes to six our programme
automatically knows; one maintenance issue resolved before it happens.

Prior to SAS 9.2, there was no way to have multiple items with the same key;

LARGE HASH TABLES: DECLARE hash patients(DATASET:'data.patients', HASHEXP: 20);
The hash object data items and key item can be character, numeric, or a combination of
both.
*/
data hash_2;
declare hashX(dataset: table (rename=(CPEVENT=VISIT)));
run;

data hash_3;
length patient $5 trtgrp $30;
if _N_ = 1 then do;
declare hash eg;
eg = _new_ hash();
/* declare hash eg();*/
rc = eg.defineKey('patient');
rc = eg.defineData('trtgrp');
rc = eg.defineDone();
/* get rid of warning if variable not initialized
The CALL MISSING routine assigns a missing value to
the specified character or numeric variables.
*/
call missing(patient, trtgrp);
end;
do i=1 to 2000;
rc = eg.add(key: 'Y1', data: 'Treatment cone. A');
rc = eg.add(key: 'X2', data: 'Treatment cone. B');
rc = eg.add(key: 'Y2', data: 'Control');
rc = eg.add(key: 'Y2', data: 'Control');
eg.output(dataset:'hash5New');
end;
/* find data using key */
do i=1 to 12;
rc = eg.find(key: 'X2');
if (rc = 0) then put trtgrp=;
else put 'Key not found';

rc = eg.find(key: 'Y1');
if (rc = 0) then put trtgrp=;
else put 'Key not found';
end;

run;



data hash_3a;
length patientId 5. trtgrp $30;
if _N_ = 1 then do;
declare hash eg;
eg = _new_ hash();
/* declare hash eg();*/
rc = eg.defineKey('patientId');
rc = eg.defineData('trtgrp');
rc = eg.defineDone();
call missing(patientId, trtgrp);
end;
do i=1 to 2000000;
rc = eg.add(key:i, data: 'Treatment cone. A');
end;
eg.output(dataset:'hash5New');
run;

/*Hash table
The key component
?? might consist of numeric and character values
?? maps key values to data rows
?? must be unique
?? can be a composite.
The data component
?? can contain multiple data values per key value
?? can consist of numeric and character values.
*/
/*read a dataset and put it into hashtable for fast processing*/
data hash_3b;
length patientId 5. trtgrp $30;
set hash_3a end=eof;
/*if  _N_ =  0 then set hash_3a;*/
if _N_ = 1 then
do;
declare hash ptHash();
ptHash.defineKey('patientId');
ptHash.defineData('trtgrp');
ptHash.defineDone();
end;
/*set hash_3a end=eof;*/
if (eof eq 0) then
do;
rc = ptHash.add(key:patientId, data: trtgrp);
* rc = ptHash.add();
end;
else
ptHash.output(dataset:'hash4bNew');
run;

data hash_4;
length patient $5 trtgrp $30;
do i=1 to 20000000;
patient = 'Y1';
trtgrp = 'Treatment Cone. A';
output;
end;
run;

data hash_5;
length patient $7 trtgrp $30 trtstdt $9 trtendt $9;
if _N_ = 1 then do;
declare hash eg;
eg = _new_ hash();
/* declare hash eg();*/
rc = eg.defineKey('patient');
rc = eg.defineData('trtgrp','trtstdt', 'trtendt');
rc = eg.defineDone();
do i=1 to 20000;
keyStr = 'X';  * || i;
rc = eg.add(key:keyStr, data:'Treatment A', data:'13Feb2012',data:'21Mar2012');
if (rc = 0) then ;
else ; * put 'output error';
end;
eg.output(dataset:'hash5New');
end;
run;


data hash7a;
input IDnum Jobcode $ Salary Bonus;
cards;
1400 ME1 29769 587
1403 ME1 28072 342
1120 ME1 28619 986
1653 ME2 35108 987
;
run;

/*define hash table from an existed dataset*/
data hash7;
declare hash newH(dataset: "hash7a");
if _n_=1 then
do;
set hash7a(keep=IDnum Jobcode Salary Bonus) end=eof;
newH.definekey("IDnum");
* newH.defineData(All, "Yes"); /* use all columns */
newH.defineData("Jobcode"); /* use all columns */
newH.DefineDone();
end;
newH.add(key:IDnum, data:JobCode);
if (_N_=4) then
do;
rc = newH.find(key:idNum, data:"1400");
newH.output(dataset: 'HashNew');
put rc;
end;
run;

/*
real time           53.11 seconds
cpu time            7.91 seconds
Hash objects are also very good for data summarization  and can typically execute the job up
to  twice as fast while utilizing a third of the memory when compared with data step programming
*/



/*ad50_40*/
/* When is the input of I/O measured?
this is the I/O time, means input output, not processing time, just input and
output, so once data from disk load into memory, then finish, other operations
in memory such as observations append/delete/logic/business... is CPU time, not
I/O time, like raw data converted to variables.
*/

data ad50_42;
array t{4,3} (1,2,3,4,5,6,7,8,9,10,11,12); /* array{rows, cols}*/
put t{3,2}=; /* it is not 3*2, but it is  2*3+2 = 8 , (row-1)*colsPerRow + col_id */
put t(1,1) = ; * (1-1)*3 + 1 = 1;
put t(3,3) =; * (3-1)*3+3 = 9;
put t(4,3) = ; * (4-1)*3 + 3 = 12;
run;
/*
123
456
789
101112
*/

/*proc sql: Where works more efficient on : If the key variable has few unique values,
typically a large subset is retrieved when a WHERE statement references the
key variable;*/

/*
the SORTEDBY= option in data set
It requires that SAS check the order of the data as it processes it.
When SAS processes the data set created with the SORTEDBY= option,
SAS must validate the order as the data is processed.
*/

data ad2_3(sortedby=patid); /* add a sorted flag to a sorted dataset, require SAS
validate the order when process data */
input patid @@;
cards;
110 109 111
;
run;

/*
For large dataset, first sort data by group, then create index, then reduce
I/O operation during retrieve data
*/

*************************************ADV 63*************************************;
Data ad63_1;
input name $ course $ score;
datalines;
Lauren Math1 90
Patel Math1 89
Chang Math1 77
Chang Math1 88
Chang Math1 89
Patel Math2 90
Patel Math2 98
Chang Math2 87
Patel Math3 90
Patel Math3 98
Chang Math3 87
;
run;

proc means data=ad63_1;
class course;
var score;
where score > 90;
run;

proc sql;
create index score on ad63_1;
quit;

proc sql;
select * from ad63_1
where course in (select course from ad63_1 where course='Math1' or course='Math2');
quit;

proc sql;
select * from ad63_1 where score > all (select score from ad63_1 where name like 'C%');
quit;


proc sql;
select * from ad63_1 where score < all (select score from ad63_1 where name like 'P%');
quit;

/*
x = ANY (...)": The value must match one or more values in the list to evaluate to TRUE.
x != ANY (...)": The value must not match one or more values in the list to evaluate to TRUE.
x > ANY (...)": The value must be greater than the smallest value in the list to evaluate to TRUE.
x < ANY (...)": The value must be smaller than the biggest value in the list to evaluate to TRUE.
x >= ANY (...)": The value must be greater than or equal to the smallest value in the list to evaluate to TRUE.
x <= ANY (...)": The value must be smaller than or equal to the biggest value in the list to evaluate to TRUE.
*/

/* SOME

The SOME and ANY comparison conditions do exactly the same thing and are completely
interchangeable*/

proc sql;
select * from ad63_1 where score = any(select score from ad63_1 where score > 80);
quit;

/*
x = ALL (...)": The value must match all the values in the list to evaluate to TRUE.
x != ALL (...)": The value must not match any values in the list to evaluate to TRUE.
x > ALL (...)": The value must be greater than the biggest value in the list to evaluate to TRUE.
x < ALL (...)": The value must be smaller than the smallest value in the list to evaluate to TRUE.
x >= ALL (...)": The value must be greater than or equal to the biggest value in the list to evaluate to TRUE.
x <= ALL (...)": The value must be smaller than or equal to the smallest value in the list to evaluate to TRUE.
*/
proc sql;
select * from ad63_1 where score < any (select score from ad63_1 where name like 'P%');
quit;

proc sql;
select * from ad63_1 where score between 80 and 95;
quit;

proc sql;  /* ERROR: CONTAINS operator requires character operands */
select * from ad63_1 where score contains 87 or 88 or 89 ;
quit;

proc sql;
describe table ad63_1;
quit;
proc sql;
select * from ad63_1 where name contains 'Chang';
quit;

proc sql;
select * from ad63_1 where course contains 'Mat';
quit;


proc sql;
select * from ad63_1 where course ? 'Mat';  /*  ? = contains */
quit;


proc sql;
select * from ad63_1 where course ? 'Math1' or course ? 'Math2';  /*  correct */
quit;

/*?????????????? convert num to char */
proc sql;
select * from ad63_1 where score contains 87;
/*ERROR: CONTAINS operator requires character operands*/
quit;


proc sql;
select * from ad63_1 where score in (90,89,98,120);
quit;


proc sql; /*  =* means sounds like */
select * from ad63_1 where name =* 'Patal';  /* Petal sounds like Patal */
quit;

proc sql; /*  =* means sounds like */
select * from ad63_1 where exists (select * from ad63_1 where score > 88);
quit;


/*
between and, contains or ?, in
is missing, is null
like
=* ------ value sound like a value
any, all, exists
*/

data ad63_31a;
input X $ Y;
datalines;
A 10
A 3
A 14
B 9
;
run;
data ad63_31b;
input SumY;
datalines ;
36
;
run;
data ad63;
if _n_ = 1 then set ad63_31b;
set ad63_31a;
run;

data ad63;
set ad63_31b;
set ad63_31a;
run;

data ad63;
if _n_ = 1 then set ad63_31a;
set ad63_31b;
run;

proc print data= ad63;
run;

data ad63;
if _n_ = 2 then set ad63_31b;
set ad63_31a;
run;

proc print data= ad63;
run;

data ad63_48;
infile datalines;
input Name $ Birthday :mmddyy10.;
datalines;
Alan 11/15/1950
Barb 08/23/1966
;
run;
%let Want=23Aug1966;
Proc print data=ad63_48;
where Birthday="&Want"d;
run;



Proc print data=ad63_48;
where Birthday='&Want'd;
/*error: invalid date time constant, treat as char*/
run;

/*usage of filename and file*/
filename myhtml "&prjpath\procreg.html";
ods html body=myhtml;
proc reg data=base_39;
  model write= female math;
run;
quit;
ods html close;


filename emailout email
TO = "duke.vicsas@gmail.com"
subject = "Test email by Duke - Sent by SAS 9.3"
from = "duke.fcbk@gmail.com" importance = "high" ;

data _null_;
file emailout; /* as input */
put "filename emailout email";
put "Test email by Duke - Sent by SAS 9.3 - ";
put "This email was sent by SAS code as below:";
put "TO = 'duke.vicsas@gmail.com'";
put "subject = 'Test email by Duke - Sent by SAS 9.3'";
put "from = 'duke.fcbk@gmail.com' importance = 'high'; ";
put "run;";
run;


filename myhtml "&prjpath\procreg.html";
filename emailout email TO = "" subject = "" from = "";

data _null_;
file emailout; /* as input */
put "something";  /*normally from dataset*/
run;

data _null_;
file "&prjpath\newFile.sas";
put "somthing";
run;


%macro MP_ONE(pname=means);
proc &pname data=ad63_48;
run;
%mend;
%MP_ONE(pname=print);   /* run correct */
%MP_ONE(print);   /* not correct format, parameters not in correct order */
%MP_ONE();
proc means data=ad63_48;
run;
proc print data=ad63_48;
run;


data ad63_31;
input X $ Y Name $ Birth;
datalines;
A 10 Johon 97
A 3 Mike 98
A 14 Duke 89
B 9 Line 898
;
run;

%macro varGrp1;
  X Y;
%mend;

%macro varGrp2;
  Name Birth;
%mend;
proc print data=ad63_31;
var X Y %varGrp2;
run;

proc print data=ad63_48;
var Birthday %varGrp1 ;
run;

%macro start;
%put ---------> xxx;
%mend;
%start;

data ad_matrix;
array multi{1:2,2} (1,2);
do i=1 to 2;
do j=1 to 2;
put multi[i,j]=;
end;
end;
put multi[1,1]=  multi[1,2]=  multi[2,1]=  multi[2,2]=;
run;

proc print data=ad_matrix;
run;
data ad_matrix;
array mul{1:2,2} (1,2);  /* mul[1,2] not equal to mul[2,1] */
/* mul[1,1]=mul1 = 1, mul[1,2]=mul2=2, mul[2,1]=mul3, mul[2,2]=mmul4 */
array mulx{2,2} (1,2);  /* init according to array id: 1,2,3,4,5.... */
array muly{2:3,2} (1,2,3);   /* rowId=(rowEnd-rowStat), arrayId = (rowId-1)*colsPerRow */
put muly[3,1]=; /* = 3 */
run;
proc print data=ad_matrix;
run;




data ad50_42;
array t{4,3} (1,2,3,4,5,6,7,8,9,10,11,12); /* array{rows, cols}*/
put t{3,2}=; /* it is not 3*2, but it is  2*3+2 = 8 , (row-1)*colsPerRow + col_id */
put t(1,1) = ; * (1-1)*3 + 1 = 1;
put t(3,3) =; * (3-1)*3+3 = 9;
put t(4,3) = ; * (4-1)*3 + 3 = 12;
run;


%put &SYSSCP;  /* result: Win */




PROC SQL;
CONNECT TO ODBC as myODBC (DSN=MyDSN USER=linux_user PASSWORD=sqlpwd);
CREATE TABLE recent_CA as
SELECT * FROM CONNECTION TO myODBC
(SELECT f.*
FROM fruit as f inner join vCA as c
on f.fruit=c.item
WHERE date>&recent
ORDER BY f.fruit, f.price, f.qty
);
QUIT;


LIBNAME mylibref ODBC NOPROMPT="UID=;PWD=;DSN=winsas-dsn;SERVER=lanssql\lansql;DATABASE=Car;" ;
PROC PRINT DATA = mylibref.tbTestDbCon(obs=10);


libname carlib sqlsvr
   noprompt="uid=hp7;
   pwd=red2hat;
   dsn=winsas_dsn;"
   stringdates=yes;

proc print data=mydblib.customers;
   where state='CA';  
run;


LIBNAME mSQL ODBC NOPROMPT="UID=hp7;PWD=red2hat;DSN=winsas_dsn" ;

LIBNAME mSQL ODBC uid='hp7' pwd='red2hat' DSN='winsas_dsn' ; * schema=MSSQLTips;

LIBNAME mSQL ODBC uid='hp7' pwd='red2hat' DSN='winsas_dsn' ; * schema=MSSQLTips;
PROC CONTENTS DATA = mSQL.tbTestDbCon;
run;
PROC PRINT DATA = mSQL.tbTestDbCon;
RUN;

proc setinit;
run;



data _null_;
set ad63_31 nobs=j;
file 'a.txt';
header='.........';
footer='..........';
if _n_=1 then do;
put header;
end;
put X Y Name Birth ;
if _n_=j then do;
put footer;
end;
run;

No comments:

Post a Comment