Principal component analysis; PCA
NLP Embedding 인가?
위와 같이 상관계수 행렬이 주어졌을 때 주성분 분석
data 입력
/* Q2: 19 x 19 correlation matrix, one matrix row per observation
   (variables y1-y19 are the columns).
   In-stream data must begin on the line after DATALINES, one record per
   observation, and the terminating semicolon must sit on its own line. */
data Q2;
    input y1 - y19;
    datalines;
1.000 .934 .927 .909 .524 .799 .854 .789 .835 .845 -.458 .917 .939 .953 .895 .691 .327 -.676 .702
.934 1.000 .941 .944 .487 .821 .865 .834 .863 .878 -.496 .942 .961 .954 .899 .652 .305 -.712 .729
.927 .941 1.000 .933 .543 .856 .886 .846 .862 .863 -.522 .940 .956 .946 .882 .694 .356 -.667 .746
.909 .944 .933 1.000 .499 .833 .889 .885 .850 .881 -.488 .945 .952 .949 .908 .623 .272 -.736 .777
.524 .487 .543 .499 1.000 .703 .719 .253 .462 .567 -.174 .516 .494 .452 .551 .815 .746 -.233 .285
.799 .821 .856 .833 .703 1.000 .923 .699 .752 .836 -.317 .846 .849 .823 .831 .812 .553 -.504 .499
.854 .865 .886 .889 .719 .923 1.000 .751 .793 .913 -.383 .907 .914 .886 .891 .855 .567 -.502 .592
.789 .834 .846 .885 .253 .699 .751 1.000 .745 .787 -.497 .861 .876 .878 .794 .410 .067 -.758 .793
.835 .863 .862 .850 .462 .752 .793 .745 1.000 .805 -.356 .848 .877 .883 .818 .620 .300 -.666 .671
.845 .878 .863 .881 .567 .836 .913 .787 .805 1.000 -.371 .902 .901 .891 .848 .712 .384 -.629 .668
-.458 -.496 -.522 -.488 -.174 -.317 -.383 -.497 -.356 -.371 1.000 -.465 -.447 -.439 -.405 -.198 -.032 .492 -.425
.917 .942 .940 .945 .516 .846 .907 .861 .848 .902 -.465 1.000 .981 .971 .908 .725 .396 -.657 .696
.939 .961 .956 .952 .494 .849 .914 .876 .877 .901 -.447 .981 1.000 .991 .920 .714 .360 -.655 .724
.953 .954 .946 .949 .452 .823 .886 .878 .883 .891 -.439 .971 .991 1.000 .921 .676 .298 -.678 .731
.895 .899 .882 .908 .551 .831 .891 .794 .818 .848 -.405 .908 .920 .921 1.000 .720 .378 -.633 .694
.691 .652 .694 .623 .815 .812 .855 .410 .620 .712 -.198 .725 .714 .676 .720 1.000 .781 -.186 .287
.327 .305 .356 .272 .746 .553 .567 .067 .300 .384 -.032 .396 .360 .298 .378 .781 1.000 .169 .026
-.676 -.712 -.667 -.736 -.233 -.504 -.502 -.758 -.666 -.629 .492 -.657 -.655 -.678 -.633 -.186 .169 1.000 -.775
.702 .729 .746 .777 .285 .499 .592 .793 .671 .668 -.425 .696 .724 .731 .694 .287 .026 -.775 1.000
;
run;
고유치들로만 이루어진 행렬 만들기
/* Eigen-decomposition of the matrix stored in Q2.
   Prints the matrix, its transpose, the difference (transpose - matrix) as a
   symmetry check, the eigenvalues and the eigenvectors, then saves the
   eigenvalues to the data set VALUES (single variable: val). */
proc iml;
    use Q2;
    read all into A;
    close Q2;                     /* release the input data set once it is read */
    print A;

    /* Symmetry check: every entry of test should be 0. */
    T = T(A);
    print T;
    test = T - A;
    print test;

    /* One decomposition yields both eigenvalues and eigenvectors,
       instead of separate EIGVAL and EIGVEC calls. */
    call eigen(val, vec, A);
    print val;
    print vec;

    /* Name the variable explicitly so VALUES contains only val. */
    create values var {val};
    append;
    close values;
quit;
고유치를 이용하여 누적률 계산하기
/* Proportion and cumulative proportion of the total for each eigenvalue.
   The total is computed from the data instead of being typed in by hand
   (the original run reported a sum of 19.0000000). */
proc means data = values sum;
    var val;
    output out = val_total(keep = total) sum = total;
run;

data h;
    /* Read the one-row total once; variables read by SET are retained
       automatically for every following observation. */
    if _n_ = 1 then set val_total;
    set values;
    percent = val / total;
    retain cumul 0;
    cumul = cumul + percent;
    keep percent cumul val;
run;

proc print data = h;
run;
data 입력
/* Q4: raw data, 46 observations of 11 variables (y1-y11).
   In-stream data must begin on the line after DATALINES, one record per
   observation, and the terminating semicolon must sit on its own line.
   NOTE(review): y1 = 58 in observation 14 is far from the neighbouring
   rows (86-90) - possibly a typo for 88; confirm against the source table. */
data Q4;
    input y1 - y11;
    datalines;
84 65 147 85 59 151 95 40 398 273 30
84 65 149 86 61 159 94 28 345 140 34
79 66 142 83 64 152 94 41 368 318 33
81 67 147 83 65 158 94 50 406 282 26
84 68 167 88 69 180 93 46 379 311 41
74 66 131 77 67 147 96 73 478 446 4
73 66 131 78 69 159 96 72 462 294 5
75 67 134 84 68 159 95 70 464 313 20
84 68 161 89 71 195 95 63 430 455 31
86 72 169 91 76 206 93 56 406 604 36
88 73 176 91 76 206 94 55 393 610 43
90 74 187 94 76 211 94 51 385 520 47
88 72 171 94 75 211 96 54 405 663 45
58 72 171 92 70 201 95 51 392 467 45
81 69 154 87 68 167 95 61 448 184 11
79 68 149 83 68 162 95 59 436 177 10
84 69 160 87 66 173 95 42 392 173 30
84 70 160 87 68 177 94 44 392 76 29
84 70 168 88 70 169 95 48 396 72 23
77 67 147 83 66 170 97 60 431 183 16
87 67 166 92 67 196 96 44 379 76 37
89 69 171 92 72 199 94 48 393 230 50
89 72 180 94 72 204 95 48 394 193 36
93 72 186 92 73 201 94 47 386 400 54
93 74 188 93 72 206 95 47 389 339 44
94 75 199 94 72 208 96 45 370 172 41
93 74 193 95 73 214 95 50 396 238 45
93 74 196 95 70 210 96 45 380 118 42
96 75 198 95 71 207 93 40 365 93 50
95 76 202 95 69 202 93 39 357 269 48
84 73 173 96 69 173 94 58 418 128 17
91 71 170 91 69 168 94 44 420 423 20
88 72 179 89 70 189 93 50 399 415 15
89 72 179 95 71 210 98 46 389 300 42
91 72 182 96 73 208 95 43 384 193 44
92 74 196 97 75 215 96 46 389 195 41
94 75 192 96 69 198 95 36 380 215 49
96 75 195 95 67 196 97 24 354 185 53
93 76 198 94 75 211 93 43 364 466 53
88 74 188 92 73 198 95 52 405 399 21
88 74 178 90 74 197 95 61 447 232 1
91 72 175 94 70 205 94 42 380 275 44
92 72 190 95 71 209 96 44 379 166 44
92 73 189 96 72 208 93 42 372 189 46
94 75 194 95 71 208 93 43 373 164 47
96 76 202 96 71 208 94 40 368 139 50
;
run;
주성분 분석
/* To base the principal components on the covariance matrix,
   add the COVARIANCE option. Output is written to cov_out. */
proc princomp data = Q4 out = cov_out covariance;
    var y1 -- y11;
run;
상관계수로 하고 싶으면 (default가 corr)
/* Principal components of Q4 based on the correlation matrix
   (no COVARIANCE option given). Output is written to cov_out. */
proc princomp
        data = Q4
        out  = cov_out;
    var y1 -- y11;
run;
정규성 검토
/* Normality check via the contrapositive: if the data were multivariate
   normal, each principal component would be normal too, so a clearly
   non-normal component argues against multivariate normality.
   Draws a Q-Q plot for each of prin1-prin4 in cov_out. */
proc univariate data = cov_out;
    var prin1 - prin4;
    qqplot prin1 prin2 prin3 prin4;
run;