Data: StateMilk • Chart ID: GeoChartID26f01b803502 • googleVis-0.6.0
R version 3.3.1 (2016-06-21) • Google Terms of Use • Documentation and Data Policy
R version 3.3.1 (2016-06-21) • Google Terms of Use • Documentation and Data Policy
duration | src_bytes | dst_bytes | land | wrong_fragment | urgent | hot | num_failed_logins | logged_in | num_compromised | root_shell | su_attempted | num_root | num_file_creations | num_shells | num_access_files | num_outbound_cmds | is_hot_login | is_guest_login | count | srv_count | serror_rate | srv_serror_rate | rerror_rate | srv_rerror_rate | same_srv_rate | diff_srv_rate | srv_diff_host_rate | dst_host_count | dst_host_srv_count | dst_host_same_srv_rate | dst_host_diff_srv_rate | dst_host_same_src_port_rate | dst_host_srv_diff_host_rate | dst_host_serror_rate | dst_host_srv_serror_rate | dst_host_rerror_rate | dst_host_srv_rerror_rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
duration | 1.000000 | 0.014196 | 0.299189 | -0.001068 | -0.008025 | 0.017883 | 0.108639 | 0.014363 | 0.159564 | 0.010687 | 0.040425 | 0.026015 | 0.013401 | 0.061099 | 0.008632 | 0.019407 | -0.000019 | -0.000010 | 0.205606 | -0.259032 | -0.250139 | -0.074211 | -0.073663 | -0.025936 | -0.026420 | 0.062291 | -0.050875 | 0.123621 | -0.161107 | -0.217167 | -0.211979 | 0.231644 | -0.065202 | 0.100692 | -0.056753 | -0.057298 | -0.007759 | -0.013891 |
src_bytes | 0.014196 | 1.000000 | -0.167931 | -0.009404 | -0.019358 | 0.000094 | 0.113920 | -0.008396 | -0.089702 | 0.118562 | 0.003067 | 0.002282 | -0.002050 | 0.027710 | 0.014403 | -0.001497 | 0.000010 | 0.000019 | 0.027511 | 0.666230 | 0.722609 | -0.657460 | -0.652391 | -0.342180 | -0.332977 | 0.744046 | -0.739988 | -0.104042 | 0.130377 | 0.741979 | 0.729151 | -0.712965 | 0.815039 | -0.140231 | -0.645920 | -0.641792 | -0.297338 | -0.300581 |
dst_bytes | 0.299189 | -0.167931 | 1.000000 | -0.003040 | -0.022659 | 0.007234 | 0.193156 | 0.021952 | 0.882185 | 0.169772 | 0.026054 | 0.012192 | -0.003884 | 0.034154 | -0.000054 | 0.065776 | -0.000031 | 0.000041 | 0.085947 | -0.639157 | -0.497683 | -0.205848 | -0.198715 | -0.100958 | -0.081307 | 0.229677 | -0.222572 | 0.521003 | -0.611972 | 0.024124 | 0.055033 | -0.035073 | -0.396195 | 0.578557 | -0.167047 | -0.158378 | -0.003042 | 0.001621 |
land | -0.001068 | -0.009404 | -0.003040 | 1.000000 | -0.000333 | -0.000065 | -0.000539 | -0.000076 | -0.002785 | -0.000447 | -0.000093 | -0.000049 | -0.000230 | -0.000150 | -0.000076 | -0.000211 | -0.002881 | 0.002089 | -0.000250 | -0.010939 | -0.010128 | 0.014160 | 0.014342 | -0.000451 | -0.001690 | 0.002153 | -0.001846 | 0.020678 | -0.019923 | -0.012341 | 0.002576 | -0.001803 | 0.004265 | 0.016171 | 0.013566 | 0.012265 | 0.000389 | -0.001816 |
wrong_fragment | -0.008025 | -0.019358 | -0.022659 | -0.000333 | 1.000000 | -0.000150 | -0.004042 | -0.000568 | -0.020911 | -0.003370 | -0.000528 | -0.000248 | -0.001727 | -0.001160 | -0.000507 | -0.001519 | -0.000147 | 0.000441 | -0.001869 | -0.057711 | -0.029117 | -0.008849 | -0.023382 | 0.000430 | -0.012676 | 0.010218 | -0.009386 | 0.012117 | -0.029149 | -0.058225 | -0.049560 | 0.055542 | -0.015449 | 0.007306 | 0.010387 | -0.024117 | 0.046656 | -0.013666 |
urgent | 0.017883 | 0.000094 | 0.007234 | -0.000065 | -0.000150 | 1.000000 | 0.008594 | 0.063009 | 0.006821 | 0.031765 | 0.067437 | 0.000020 | 0.061994 | 0.061383 | -0.000066 | 0.023380 | 0.012879 | 0.005162 | -0.000100 | -0.004778 | -0.004799 | -0.001338 | -0.001327 | -0.000705 | -0.000726 | 0.001521 | -0.001522 | -0.000788 | -0.005894 | -0.005698 | -0.004078 | 0.005208 | -0.001939 | -0.000976 | -0.001381 | -0.001370 | -0.000786 | -0.000782 |
hot | 0.108639 | 0.113920 | 0.193156 | -0.000539 | -0.004042 | 0.008594 | 1.000000 | 0.112560 | 0.189126 | 0.811529 | 0.101983 | -0.000400 | 0.003096 | 0.028694 | 0.009146 | 0.004224 | -0.000393 | -0.000248 | 0.463706 | -0.120847 | -0.114735 | -0.035487 | -0.034934 | 0.013468 | 0.052003 | 0.041342 | -0.040555 | 0.032141 | -0.074178 | -0.017960 | 0.018783 | -0.017198 | -0.086998 | -0.014141 | -0.004706 | -0.010721 | 0.199019 | 0.189142 |
num_failed_logins | 0.014363 | -0.008396 | 0.021952 | -0.000076 | -0.000568 | 0.063009 | 0.112560 | 1.000000 | -0.002190 | 0.004619 | 0.016895 | 0.072748 | 0.010060 | 0.015211 | -0.000093 | 0.005581 | 0.003431 | -0.001560 | -0.000428 | -0.018024 | -0.018027 | -0.003674 | -0.004027 | 0.035324 | 0.034876 | 0.005716 | -0.005538 | -0.003096 | -0.028369 | -0.015092 | 0.003004 | -0.002960 | -0.006617 | -0.002588 | 0.014713 | 0.014914 | 0.032395 | 0.032151 |
logged_in | 0.159564 | -0.089702 | 0.882185 | -0.002785 | -0.020911 | 0.006821 | 0.189126 | -0.002190 | 1.000000 | 0.161190 | 0.025293 | 0.011813 | 0.082533 | 0.055530 | 0.024354 | 0.072698 | 0.000079 | 0.000127 | 0.089318 | -0.578287 | -0.438947 | -0.187114 | -0.180122 | -0.091962 | -0.072287 | 0.216969 | -0.214019 | 0.503807 | -0.682721 | 0.080352 | 0.114526 | -0.093565 | -0.359506 | 0.659078 | -0.143283 | -0.132474 | 0.007236 | 0.012979 |
num_compromised | 0.010687 | 0.118562 | 0.169772 | -0.000447 | -0.003370 | 0.031765 | 0.811529 | 0.004619 | 0.161190 | 1.000000 | 0.085558 | 0.048985 | 0.028557 | 0.031223 | 0.011256 | 0.006977 | 0.001048 | -0.000438 | -0.002504 | -0.097212 | -0.091154 | -0.030516 | -0.030264 | 0.008573 | 0.054006 | 0.035253 | -0.034953 | 0.036497 | -0.041615 | 0.003465 | 0.038980 | -0.039091 | -0.078843 | -0.020979 | -0.005019 | -0.004504 | 0.214115 | 0.217858 |
root_shell | 0.040425 | 0.003067 | 0.026054 | -0.000093 | -0.000528 | 0.067437 | 0.101983 | 0.016895 | 0.025293 | 0.085558 | 1.000000 | 0.233486 | 0.094512 | 0.140650 | 0.132056 | 0.069353 | 0.011462 | -0.006602 | -0.000405 | -0.016409 | -0.015174 | -0.004952 | -0.004923 | -0.001104 | -0.001143 | 0.004946 | -0.004553 | 0.002286 | -0.021367 | -0.011906 | 0.000515 | -0.000916 | -0.004617 | 0.008631 | -0.003498 | -0.003032 | 0.002763 | 0.002151 |
su_attempted | 0.026015 | 0.002282 | 0.012192 | -0.000049 | -0.000248 | 0.000020 | -0.000400 | 0.072748 | 0.011813 | 0.048985 | 0.233486 | 1.000000 | 0.119326 | 0.053110 | 0.040487 | 0.081272 | -0.018896 | 0.012927 | -0.000219 | -0.008279 | -0.008225 | -0.002318 | -0.002295 | -0.001227 | -0.001253 | 0.002634 | -0.002649 | 0.000348 | -0.006697 | -0.006288 | -0.005738 | 0.006687 | -0.005020 | 0.001052 | 0.001974 | 0.002893 | 0.003173 | 0.001731 |
num_root | 0.013401 | -0.002050 | -0.003884 | -0.000230 | -0.001727 | 0.061994 | 0.003096 | 0.010060 | 0.082533 | 0.028557 | 0.094512 | 0.119326 | 1.000000 | 0.047521 | 0.034405 | 0.014513 | 0.001524 | -0.002585 | -0.001281 | -0.054721 | -0.053530 | -0.016031 | -0.015936 | -0.008610 | -0.008708 | 0.013881 | -0.011337 | 0.006316 | -0.078717 | -0.038689 | -0.038935 | 0.047414 | -0.015968 | 0.061030 | -0.008457 | -0.007096 | -0.000421 | -0.005012 |
num_file_creations | 0.061099 | 0.027710 | 0.034154 | -0.000150 | -0.001160 | 0.061383 | 0.028694 | 0.015211 | 0.055530 | 0.031223 | 0.140650 | 0.053110 | 0.047521 | 1.000000 | 0.068660 | 0.031042 | -0.004081 | -0.001664 | 0.013242 | -0.036467 | -0.034598 | -0.009703 | -0.010390 | -0.005069 | -0.004775 | 0.009784 | -0.008711 | 0.014412 | -0.049529 | -0.026890 | -0.021731 | 0.027092 | -0.015018 | 0.030590 | -0.002257 | -0.004295 | 0.000626 | -0.001096 |
num_shells | 0.008632 | 0.014403 | -0.000054 | -0.000076 | -0.000507 | -0.000066 | 0.009146 | -0.000093 | 0.024354 | 0.011256 | 0.132056 | 0.040487 | 0.034405 | 0.068660 | 1.000000 | 0.019438 | -0.002592 | -0.006631 | -0.000405 | -0.013938 | -0.011784 | -0.004343 | -0.004740 | -0.002541 | -0.002572 | 0.004282 | -0.003743 | 0.001096 | -0.021200 | -0.012017 | -0.009962 | 0.010761 | -0.003521 | 0.015882 | -0.001588 | -0.002357 | -0.000617 | -0.002020 |
num_access_files | 0.019407 | -0.001497 | 0.065776 | -0.000211 | -0.001519 | 0.023380 | 0.004224 | 0.005581 | 0.072698 | 0.006977 | 0.069353 | 0.081272 | 0.014513 | 0.031042 | 0.019438 | 1.000000 | -0.001597 | -0.002850 | 0.002466 | -0.045282 | -0.040497 | -0.013945 | -0.013572 | -0.007581 | 0.001874 | 0.015499 | -0.015112 | 0.024266 | -0.023865 | -0.023657 | -0.021358 | 0.026703 | -0.033288 | 0.011765 | -0.011197 | -0.011487 | -0.004743 | -0.004552 |
num_outbound_cmds | -0.000019 | 0.000010 | -0.000031 | -0.002881 | -0.000147 | 0.012879 | -0.000393 | 0.003431 | 0.000079 | 0.001048 | 0.011462 | -0.018896 | 0.001524 | -0.004081 | -0.002592 | -0.001597 | 1.000000 | 0.822890 | 0.000924 | -0.000076 | 0.000100 | 0.000167 | 0.000209 | 0.000536 | 0.000346 | 0.000208 | 0.000328 | -0.000141 | -0.000424 | -0.000280 | -0.000503 | -0.000181 | -0.000455 | 0.000288 | -0.000011 | -0.000372 | -0.000823 | -0.001038 |
is_hot_login | -0.000010 | 0.000019 | 0.000041 | 0.002089 | 0.000441 | 0.005162 | -0.000248 | -0.001560 | 0.000127 | -0.000438 | -0.006602 | 0.012927 | -0.002585 | -0.001664 | -0.006631 | -0.002850 | 0.822890 | 1.000000 | 0.001512 | 0.000036 | 0.000064 | 0.000102 | -0.000302 | -0.000550 | 0.000457 | -0.000159 | -0.000235 | -0.000360 | -0.000106 | 0.000206 | 0.000229 | -0.000004 | 0.000283 | 0.000538 | -0.000076 | -0.000007 | -0.000435 | -0.000529 |
is_guest_login | 0.205606 | 0.027511 | 0.085947 | -0.000250 | -0.001869 | -0.000100 | 0.463706 | -0.000428 | 0.089318 | -0.002504 | -0.000405 | -0.000219 | -0.001281 | 0.013242 | -0.000405 | 0.002466 | 0.000924 | 0.001512 | 1.000000 | -0.062340 | -0.062713 | -0.017343 | -0.017240 | -0.008867 | -0.009193 | 0.018042 | -0.017000 | -0.008878 | -0.055453 | -0.044366 | -0.041749 | 0.044640 | -0.038092 | -0.012578 | -0.001066 | -0.016885 | 0.025282 | -0.004292 |
count | -0.259032 | 0.666230 | -0.639157 | -0.010939 | -0.057711 | -0.004778 | -0.120847 | -0.018024 | -0.578287 | -0.097212 | -0.016409 | -0.008279 | -0.054721 | -0.036467 | -0.013938 | -0.045282 | -0.000076 | 0.000036 | -0.062340 | 1.000000 | 0.950587 | -0.303538 | -0.308923 | -0.213824 | -0.221352 | 0.346718 | -0.361737 | -0.384010 | 0.547443 | 0.586979 | 0.539698 | -0.546869 | 0.776906 | -0.496554 | -0.331571 | -0.335290 | -0.261194 | -0.256176 |
srv_count | -0.250139 | 0.722609 | -0.497683 | -0.010128 | -0.029117 | -0.004799 | -0.114735 | -0.018027 | -0.438947 | -0.091154 | -0.015174 | -0.008225 | -0.053530 | -0.034598 | -0.011784 | -0.040497 | 0.000100 | 0.000064 | -0.062713 | 0.950587 | 1.000000 | -0.428185 | -0.421424 | -0.281468 | -0.284034 | 0.517227 | -0.511998 | -0.239057 | 0.442611 | 0.720746 | 0.681955 | -0.673916 | 0.812280 | -0.391712 | -0.449096 | -0.442823 | -0.313442 | -0.308132 |
serror_rate | -0.074211 | -0.657460 | -0.205848 | 0.014160 | -0.008849 | -0.001338 | -0.035487 | -0.003674 | -0.187114 | -0.030516 | -0.004952 | -0.002318 | -0.016031 | -0.009703 | -0.004343 | -0.013945 | 0.000167 | 0.000102 | -0.017343 | -0.303538 | -0.428185 | 1.000000 | 0.990888 | -0.091157 | -0.095285 | -0.851915 | 0.828012 | -0.121489 | 0.165350 | -0.724317 | -0.745745 | 0.719708 | -0.650336 | -0.153568 | 0.973947 | 0.965663 | -0.103198 | -0.105434 |
srv_serror_rate | -0.073663 | -0.652391 | -0.198715 | 0.014342 | -0.023382 | -0.001327 | -0.034934 | -0.004027 | -0.180122 | -0.030264 | -0.004923 | -0.002295 | -0.015936 | -0.010390 | -0.004740 | -0.013572 | 0.000209 | -0.000302 | -0.017240 | -0.308923 | -0.421424 | 0.990888 | 1.000000 | -0.110664 | -0.115286 | -0.839315 | 0.815305 | -0.112222 | 0.160322 | -0.713313 | -0.734334 | 0.707753 | -0.646256 | -0.148072 | 0.967214 | 0.970617 | -0.122630 | -0.124656 |
rerror_rate | -0.025936 | -0.342180 | -0.100958 | -0.000451 | 0.000430 | -0.000705 | 0.013468 | 0.035324 | -0.091962 | 0.008573 | -0.001104 | -0.001227 | -0.008610 | -0.005069 | -0.002541 | -0.007581 | 0.000536 | -0.000550 | -0.008867 | -0.213824 | -0.281468 | -0.091157 | -0.110664 | 1.000000 | 0.978813 | -0.327986 | 0.345571 | -0.017902 | -0.067857 | -0.330391 | -0.303126 | 0.308722 | -0.278465 | 0.073061 | -0.094076 | -0.110646 | 0.910225 | 0.911622 |
srv_rerror_rate | -0.026420 | -0.332977 | -0.081307 | -0.001690 | -0.012676 | -0.000726 | 0.052003 | 0.034876 | -0.072287 | 0.054006 | -0.001143 | -0.001253 | -0.008708 | -0.004775 | -0.002572 | 0.001874 | 0.000346 | 0.000457 | -0.009193 | -0.221352 | -0.284034 | -0.095285 | -0.115286 | 0.978813 | 1.000000 | -0.316568 | 0.333439 | 0.011285 | -0.072595 | -0.323032 | -0.294328 | 0.300186 | -0.282239 | 0.075178 | -0.096146 | -0.114341 | 0.904591 | 0.914904 |
same_srv_rate | 0.062291 | 0.744046 | 0.229677 | 0.002153 | 0.010218 | 0.001521 | 0.041342 | 0.005716 | 0.216969 | 0.035253 | 0.004946 | 0.002634 | 0.013881 | 0.009784 | 0.004282 | 0.015499 | 0.000208 | -0.000159 | 0.018042 | 0.346718 | 0.517227 | -0.851915 | -0.839315 | -0.327986 | -0.316568 | 1.000000 | -0.982109 | 0.140660 | -0.190121 | 0.848754 | 0.873551 | -0.844537 | 0.732841 | 0.179040 | -0.830067 | -0.819335 | -0.282487 | -0.282913 |
diff_srv_rate | -0.050875 | -0.739988 | -0.222572 | -0.001846 | -0.009386 | -0.001522 | -0.040555 | -0.005538 | -0.214019 | -0.034953 | -0.004553 | -0.002649 | -0.011337 | -0.008711 | -0.003743 | -0.015112 | 0.000328 | -0.000235 | -0.017000 | -0.361737 | -0.511998 | 0.828012 | 0.815305 | 0.345571 | 0.333439 | -0.982109 | 1.000000 | -0.138293 | 0.185942 | -0.844028 | -0.868580 | 0.850911 | -0.727031 | -0.176930 | 0.807205 | 0.795844 | 0.299041 | 0.298904 |
srv_diff_host_rate | 0.123621 | -0.104042 | 0.521003 | 0.020678 | 0.012117 | -0.000788 | 0.032141 | -0.003096 | 0.503807 | 0.036497 | 0.002286 | 0.000348 | 0.006316 | 0.014412 | 0.001096 | 0.024266 | -0.000141 | -0.000360 | -0.008878 | -0.384010 | -0.239057 | -0.121489 | -0.112222 | -0.017902 | 0.011285 | 0.140660 | -0.138293 | 1.000000 | -0.445051 | 0.035010 | 0.068648 | -0.050472 | -0.222707 | 0.433173 | -0.097973 | -0.092661 | 0.022585 | 0.024722 |
dst_host_count | -0.161107 | 0.130377 | -0.611972 | -0.019923 | -0.029149 | -0.005894 | -0.074178 | -0.028369 | -0.682721 | -0.041615 | -0.021367 | -0.006697 | -0.078717 | -0.049529 | -0.021200 | -0.023865 | -0.000424 | -0.000106 | -0.055453 | 0.547443 | 0.442611 | 0.165350 | 0.160322 | -0.067857 | -0.072595 | -0.190121 | 0.185942 | -0.445051 | 1.000000 | 0.022731 | -0.070448 | 0.044338 | 0.189876 | -0.918894 | 0.123881 | 0.113845 | -0.125142 | -0.125273 |
dst_host_srv_count | -0.217167 | 0.741979 | 0.024124 | -0.012341 | -0.058225 | -0.005698 | -0.017960 | -0.015092 | 0.080352 | 0.003465 | -0.011906 | -0.006288 | -0.038689 | -0.026890 | -0.012017 | -0.023657 | -0.000280 | 0.000206 | -0.044366 | 0.586979 | 0.720746 | -0.724317 | -0.713313 | -0.330391 | -0.323032 | 0.848754 | -0.844028 | 0.035010 | 0.022731 | 1.000000 | 0.970072 | -0.955178 | 0.769481 | 0.043668 | -0.722607 | -0.708392 | -0.312040 | -0.300787 |
dst_host_same_srv_rate | -0.211979 | 0.729151 | 0.055033 | 0.002576 | -0.049560 | -0.004078 | 0.018783 | 0.003004 | 0.114526 | 0.038980 | 0.000515 | -0.005738 | -0.038935 | -0.021731 | -0.009962 | -0.021358 | -0.000503 | 0.000229 | -0.041749 | 0.539698 | 0.681955 | -0.745745 | -0.734334 | -0.303126 | -0.294328 | 0.873551 | -0.868580 | 0.068648 | -0.070448 | 0.970072 | 1.000000 | -0.980245 | 0.771158 | 0.107926 | -0.742045 | -0.725272 | -0.278068 | -0.264383 |
dst_host_diff_srv_rate | 0.231644 | -0.712965 | -0.035073 | -0.001803 | 0.055542 | 0.005208 | -0.017198 | -0.002960 | -0.093565 | -0.039091 | -0.000916 | 0.006687 | 0.047414 | 0.027092 | 0.010761 | 0.026703 | -0.000181 | -0.000004 | 0.044640 | -0.546869 | -0.673916 | 0.719708 | 0.707753 | 0.308722 | 0.300186 | -0.844537 | 0.850911 | -0.050472 | 0.044338 | -0.955178 | -0.980245 | 1.000000 | -0.766402 | -0.088665 | 0.719275 | 0.701149 | 0.287476 | 0.271067 |
dst_host_same_src_port_rate | -0.065202 | 0.815039 | -0.396195 | 0.004265 | -0.015449 | -0.001939 | -0.086998 | -0.006617 | -0.359506 | -0.078843 | -0.004617 | -0.005020 | -0.015968 | -0.015018 | -0.003521 | -0.033288 | -0.000455 | 0.000283 | -0.038092 | 0.776906 | 0.812280 | -0.650336 | -0.646256 | -0.278465 | -0.282239 | 0.732841 | -0.727031 | -0.222707 | 0.189876 | 0.769481 | 0.771158 | -0.766402 | 1.000000 | -0.175310 | -0.658737 | -0.652636 | -0.299273 | -0.297100 |
dst_host_srv_diff_host_rate | 0.100692 | -0.140231 | 0.578557 | 0.016171 | 0.007306 | -0.000976 | -0.014141 | -0.002588 | 0.659078 | -0.020979 | 0.008631 | 0.001052 | 0.061030 | 0.030590 | 0.015882 | 0.011765 | 0.000288 | 0.000538 | -0.012578 | -0.496554 | -0.391712 | -0.153568 | -0.148072 | 0.073061 | 0.075178 | 0.179040 | -0.176930 | 0.433173 | -0.918894 | 0.043668 | 0.107926 | -0.088665 | -0.175310 | 1.000000 | -0.118697 | -0.103715 | 0.114971 | 0.120767 |
dst_host_serror_rate | -0.056753 | -0.645920 | -0.167047 | 0.013566 | 0.010387 | -0.001381 | -0.004706 | 0.014713 | -0.143283 | -0.005019 | -0.003498 | 0.001974 | -0.008457 | -0.002257 | -0.001588 | -0.011197 | -0.000011 | -0.000076 | -0.001066 | -0.331571 | -0.449096 | 0.973947 | 0.967214 | -0.094076 | -0.096146 | -0.830067 | 0.807205 | -0.097973 | 0.123881 | -0.722607 | -0.742045 | 0.719275 | -0.658737 | -0.118697 | 1.000000 | 0.968015 | -0.087531 | -0.096899 |
dst_host_srv_serror_rate | -0.057298 | -0.641792 | -0.158378 | 0.012265 | -0.024117 | -0.001370 | -0.010721 | 0.014914 | -0.132474 | -0.004504 | -0.003032 | 0.002893 | -0.007096 | -0.004295 | -0.002357 | -0.011487 | -0.000372 | -0.000007 | -0.016885 | -0.335290 | -0.442823 | 0.965663 | 0.970617 | -0.110646 | -0.114341 | -0.819335 | 0.795844 | -0.092661 | 0.113845 | -0.708392 | -0.725272 | 0.701149 | -0.652636 | -0.103715 | 0.968015 | 1.000000 | -0.111578 | -0.110532 |
dst_host_rerror_rate | -0.007759 | -0.297338 | -0.003042 | 0.000389 | 0.046656 | -0.000786 | 0.199019 | 0.032395 | 0.007236 | 0.214115 | 0.002763 | 0.003173 | -0.000421 | 0.000626 | -0.000617 | -0.004743 | -0.000823 | -0.000435 | 0.025282 | -0.261194 | -0.313442 | -0.103198 | -0.122630 | 0.910225 | 0.904591 | -0.282487 | 0.299041 | 0.022585 | -0.125142 | -0.312040 | -0.278068 | 0.287476 | -0.299273 | 0.114971 | -0.087531 | -0.111578 | 1.000000 | 0.950964 |
dst_host_srv_rerror_rate | -0.013891 | -0.300581 | 0.001621 | -0.001816 | -0.013666 | -0.000782 | 0.189142 | 0.032151 | 0.012979 | 0.217858 | 0.002151 | 0.001731 | -0.005012 | -0.001096 | -0.002020 | -0.004552 | -0.001038 | -0.000529 | -0.004292 | -0.256176 | -0.308132 | -0.105434 | -0.124656 | 0.911622 | 0.914904 | -0.282913 | 0.298904 | 0.024722 | -0.125273 | -0.300787 | -0.264383 | 0.271067 | -0.297100 | 0.120767 | -0.096899 | -0.110532 | 0.950964 | 1.000000 |
src_bytes | dst_bytes | hot | logged_in | num_compromised | num_outbound_cmds | is_hot_login | count | srv_count | serror_rate | srv_serror_rate | rerror_rate | srv_rerror_rate | same_srv_rate | diff_srv_rate | dst_host_count | dst_host_srv_count | dst_host_same_srv_rate | dst_host_diff_srv_rate | dst_host_same_src_port_rate | dst_host_srv_diff_host_rate | dst_host_serror_rate | dst_host_srv_serror_rate | dst_host_rerror_rate | dst_host_srv_rerror_rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
src_bytes | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False |
dst_bytes | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
hot | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
logged_in | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
num_compromised | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
num_outbound_cmds | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
is_hot_login | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
count | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
srv_count | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False |
serror_rate | False | False | False | False | False | False | False | False | False | False | True | False | False | True | True | False | False | False | False | False | False | True | True | False | False |
srv_serror_rate | False | False | False | False | False | False | False | False | False | True | False | False | False | True | True | False | False | False | False | False | False | True | True | False | False |
rerror_rate | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | True | True |
srv_rerror_rate | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | True | True |
same_srv_rate | False | False | False | False | False | False | False | False | False | True | True | False | False | False | True | False | True | True | True | False | False | True | True | False | False |
diff_srv_rate | False | False | False | False | False | False | False | False | False | True | True | False | False | True | False | False | True | True | True | False | False | True | False | False | False |
dst_host_count | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False |
dst_host_srv_count | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | False | False | True | True | False | False | False | False | False | False |
dst_host_same_srv_rate | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | False | True | False | True | False | False | False | False | False | False |
dst_host_diff_srv_rate | False | False | False | False | False | False | False | False | False | False | False | False | False | True | True | False | True | True | False | False | False | False | False | False | False |
dst_host_same_src_port_rate | True | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
dst_host_srv_diff_host_rate | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False |
dst_host_serror_rate | False | False | False | False | False | False | False | False | False | True | True | False | False | True | True | False | False | False | False | False | False | False | True | False | False |
dst_host_srv_serror_rate | False | False | False | False | False | False | False | False | False | True | True | False | False | True | False | False | False | False | False | False | False | True | False | False | False |
dst_host_rerror_rate | False | False | False | False | False | False | False | False | False | False | False | True | True | False | False | False | False | False | False | False | False | False | False | False | True |
dst_host_srv_rerror_rate | False | False | False | False | False | False | False | False | False | False | False | True | True | False | False | False | False | False | False | False | False | False | False | True | False |
dst_host_same_src_port_rate
references the percentage of the last 100 connections to the same port, for the same destination host. In our correlation matrix (and auxiliary dataframes) we find that this one is highly and positively correlated to src_bytes
and srv_count
. The former is the number of bytes sent from source to destination. The latter is the number of connections to the same service as the current connection in the past 2 seconds. We might decide not to include dst_host_same_src_port_rate
in our model if we include the other two, as a way to reduce the number of variables and later on better interpret our models.
from pyspark import SparkContext
# Create the Spark context that every RDD operation below goes through.
sc =SparkContext()
# Location of the gzipped training file; textFile() exposes it as an RDD of text lines.
data_file = "/home/osboxes/Python with Spark - part 1/pydata/kddcup.data.gz"
raw_data = sc.textFile(data_file)
# count() is an action, so this is the point where the file is actually read.
print "Train data size is {}".format(raw_data.count())
Train data size is 4898431
# Fetch the test set and expose it as an RDD of text lines.
# NOTE(review): `urllib` is not imported anywhere in the visible code, and the
# download is saved as "corrected.gz" relative to the working directory while
# test_data_file below is an absolute path -- confirm both refer to the same file.
ft = urllib.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz", "corrected.gz")
test_data_file = "/home/osboxes/Python with Spark - part 1/pydata/corrected.gz"
test_raw_data = sc.textFile(test_data_file)
# count() is an action, so this is the point where the test file is actually read.
print "Test data size is {}".format(test_raw_data.count())
Test data size is 311029
from pyspark.mllib.regression import LabeledPoint
from numpy import array
def parse_interaction(line):
    """Turn one comma-separated connection record into a LabeledPoint.

    Fields 1, 2 and 3 are left out of the feature vector. Field 41 is used
    only as the label: 0.0 when it equals 'normal.', 1.0 otherwise.

    line -- one record as a comma-separated string; field 41 must exist and
            fields 0 and 4..40 must be parseable by float().

    Returns a LabeledPoint whose features are fields 0 and 4..40 as floats.
    Raises IndexError if field 41 is missing and ValueError if a kept field
    is not numeric.
    """
    line_split = line.split(",")
    # leave_out = [1,2,3,41]
    # Slicing + concatenation is used instead of filtering on a leave_out list.
    clean_line_split = line_split[0:1] + line_split[4:41]
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))
# Parse every raw line into a LabeledPoint; map() is a lazy transformation, so
# no parsing happens until an action is run on these RDDs.
training_data = raw_data.map(parse_interaction)
test_data = test_raw_data.map(parse_interaction)
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from time import time
# Build the model
# t0/tt bracket only the train() call, so the reported time is training wall-clock time.
t0 = time()
logit_model = LogisticRegressionWithLBFGS.train(training_data)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt,3))
Classifier trained in 519.229 seconds
map
on the test_data
RDD and the model to predict each test point class.
labels_and_preds = test_data.map(lambda p: (p.label, logit_model.predict(p.features)))
filter
and count
as follows.
t0 = time()
# Accuracy = (label, prediction) pairs that agree / total test points.
# float() keeps the division from truncating to an integer under Python 2.
# NOTE: `lambda (v, p): ...` is tuple-parameter syntax, which only exists in Python 2.
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data.count())
# tt spans both count() actions in the statement above.
tt = time() - t0
print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4))
Prediction made in 20.933 seconds. Test accuracy is 0.9164
dst_host_same_src_port_rate
references the percentage of the last 100 connections to the same port, for the same destination host. In our correlation matrix (and auxiliary dataframes) we find that this one is highly and positively correlated to src_bytes
and srv_count
. The former is the number of bytes sent from source to destination. The latter is the number of connections to the same service as the current connection in the past 2 seconds. We decide not to include dst_host_same_src_port_rate
in our model since we include the other two.serror_rate
and srv_serror_rate
(% of connections that have SYN errors for same host and same service respectively) are highly positively correlated. Moreover, the set of variables that they highly correlate with is pretty much the same. They appear to contribute very similarly to our model. We will keep just serror_rate
.rerror_rate
and srv_rerror_rate
(% of connections that have REJ errors) so we will keep just rerror_rate
.dst_host_
for the previous ones (e.g. dst_host_srv_serror_rate
).same_srv_rate
and diff_srv_rate
are good candidates. Our list of variables we will drop includes:dst_host_same_src_port_rate
, (column 35).srv_serror_rate
(column 25).srv_rerror_rate
(column 27).dst_host_srv_serror_rate
(column 38).dst_host_srv_rerror_rate
(column 40).def parse_interaction_corr(line):
line_split = line.split(",")
# leave_out = [1,2,3,25,27,35,38,40,41]
clean_line_split = line_split[0:1]+line_split[4:25]+line_split[26:27]+line_split[28:35]+line_split[36:38]+line_split[39:40]
attack = 1.0
if line_split[41]=='normal.':
attack = 0.0
return LabeledPoint(attack, array([float(x) for x in clean_line_split]))
# Same raw RDDs as before, parsed with the reduced feature set; map() is lazy,
# so no parsing happens until an action is run on these RDDs.
corr_reduced_training_data = raw_data.map(parse_interaction_corr)
corr_reduced_test_data = test_raw_data.map(parse_interaction_corr)
leave_out
list for filtering is more Pythonic than slicing and concatenation indeed, but we have found it less efficient. This is very important when dealing with large datasets. The parse_interaction
functions will be called for every element in the RDD, so we need to make them as efficient as possible.
# Build the model
# t0/tt bracket only the train() call on the reduced feature set.
t0 = time()
logit_model_2 = LogisticRegressionWithLBFGS.train(corr_reduced_training_data)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt,3))
Classifier trained in 595.322 seconds
# Pair each test label with the reduced model's prediction. This rebinds
# labels_and_preds, replacing any earlier value of that name.
labels_and_preds = corr_reduced_test_data.map(lambda p: (p.label, logit_model_2.predict(p.features)))
t0 = time()
# Accuracy = pairs that agree / total test points; float() avoids Python 2 integer division.
# NOTE: `lambda (v, p): ...` is tuple-parameter syntax, which only exists in Python 2.
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(corr_reduced_test_data.count())
# tt spans both count() actions in the statement above.
tt = time() - t0
print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4))
Prediction made in 20.872 seconds. Test accuracy is 0.8599
Vector
, whereas the independence test requires a Matrix
as input. Moreover, MLlib also supports the input type RDD[LabeledPoint]
to enable feature selection via chi-squared independence tests. Again, these methods are part of the Statistics
package.LabeledPoint
. Internally, MLlib will calculate a contingency matrix and perform the Pearson's chi-squared (χ2) test. Features need to be categorical. Real-valued features will be treated as categorical in each of its different values. There is a limit of 1000 different values, so we need either to leave out some features or categorise them. In this case, we will consider just features that either take boolean values or just a few different numeric values in our dataset. We could overcome this limitation by defining a more complex parse_interaction
function that categorises each feature properly.
feature_names = ["land","wrong_fragment",
"urgent","hot","num_failed_logins","logged_in","num_compromised",
"root_shell","su_attempted","num_root","num_file_creations",
"num_shells","num_access_files","num_outbound_cmds",
"is_hot_login","is_guest_login","count","srv_count","serror_rate",
"srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
"diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
"dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
"dst_host_rerror_rate","dst_host_srv_rerror_rate"]
def parse_interaction_categorical(line):
    """Turn one comma-separated connection record into a LabeledPoint.

    Only fields 6..40 are kept as features (35 values). Field 41 is used only
    as the label: 0.0 when it equals 'normal.', 1.0 otherwise.

    line -- one record as a comma-separated string; field 41 must exist and
            fields 6..40 must be parseable by float().

    Returns a LabeledPoint whose features are fields 6..40 as floats.
    Raises IndexError if field 41 is missing and ValueError if a kept field
    is not numeric.
    """
    line_split = line.split(",")
    # Fields 0..5 are dropped; the remaining 35 columns become the features.
    clean_line_split = line_split[6:41]
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))
# Parse the raw training lines with the 35-column slice (fields 6..40).
training_data_categorical = raw_data.map(parse_interaction_categorical)
from pyspark.mllib.stat import Statistics
# Run the chi-squared test on the RDD of LabeledPoint; `chi` is iterated below,
# one result object per entry.
chi = Statistics.chiSqTest(training_data_categorical)
import pandas as pd
pd.set_option('display.max_colwidth', 30)
# Keep just the test statistic and p-value from each result.
records = [(result.statistic, result.pValue) for result in chi]
# Rows are labelled with feature_names; this assumes the results come back in
# feature-column order and that there is one name per feature -- TODO confirm.
chi_df = pd.DataFrame(data=records, index= feature_names, columns=["Statistic","p-value"])
# Bare expression: displays the frame when run as a notebook cell.
chi_df
Statistic | p-value | |
---|---|---|
land | 4.649835e-01 | 4.953041e-01 |
wrong_fragment | 3.068555e+02 | 0.000000e+00 |
urgent | 3.871844e+01 | 2.705761e-07 |
hot | 1.946331e+04 | 0.000000e+00 |
num_failed_logins | 1.277691e+02 | 0.000000e+00 |
logged_in | 3.273098e+06 | 0.000000e+00 |
num_compromised | 2.011863e+03 | 0.000000e+00 |
root_shell | 1.044918e+03 | 0.000000e+00 |
su_attempted | 4.340000e+02 | 0.000000e+00 |
num_root | 2.287168e+04 | 0.000000e+00 |
num_file_creations | 9.179739e+03 | 0.000000e+00 |
num_shells | 1.380028e+03 | 0.000000e+00 |
num_access_files | 1.873477e+04 | 0.000000e+00 |
num_outbound_cmds | 0.000000e+00 | 1.000000e+00 |
is_hot_login | 8.070987e+00 | 4.497960e-03 |
is_guest_login | 1.350051e+04 | 0.000000e+00 |
count | 4.546398e+06 | 0.000000e+00 |
srv_count | 2.296060e+06 | 0.000000e+00 |
serror_rate | 2.684199e+05 | 0.000000e+00 |
srv_serror_rate | 3.026270e+05 | 0.000000e+00 |
rerror_rate | 9.860453e+03 | 0.000000e+00 |
srv_rerror_rate | 3.247639e+04 | 0.000000e+00 |
same_srv_rate | 3.999124e+05 | 0.000000e+00 |
diff_srv_rate | 3.909998e+05 | 0.000000e+00 |
srv_diff_host_rate | 1.365459e+06 | 0.000000e+00 |
dst_host_count | 2.520479e+06 | 0.000000e+00 |
dst_host_srv_count | 1.439086e+06 | 0.000000e+00 |
dst_host_same_srv_rate | 1.237932e+06 | 0.000000e+00 |
dst_host_diff_srv_rate | 1.339002e+06 | 0.000000e+00 |
dst_host_same_src_port_rate | 2.915195e+06 | 0.000000e+00 |
dst_host_srv_diff_host_rate | 2.226291e+06 | 0.000000e+00 |
dst_host_serror_rate | 4.074546e+05 | 0.000000e+00 |
dst_host_srv_serror_rate | 4.550990e+05 | 0.000000e+00 |
dst_host_rerror_rate | 1.364790e+05 | 0.000000e+00 |
dst_host_srv_rerror_rate | 2.545474e+05 | 0.000000e+00 |
land
and num_outbound_cmds
could be removed from our model without affecting our accuracy dramatically. Let's try this.parse_interaction
# function will be to remove columns 6 and 19, corresponding to the two predictors that we want not to be part of our model.
def parse_interaction_chi(line):
    """Turn one raw comma-separated record into a LabeledPoint without columns 6 and 19.

    Of fields 0 through 40, positions 1, 2, 3, 6 and 19 are dropped and the
    rest are converted to floats. Field 41 is the label text: 'normal.'
    maps to 0.0, anything else to 1.0.
    """
    fields = line.split(",")
    # leave_out = [1,2,3,6,19,41]; 41 is excluded by the [:41] slice below
    dropped = (1, 2, 3, 6, 19)
    kept = [value for position, value in enumerate(fields[:41])
            if position not in dropped]
    label = 0.0 if fields[41] == 'normal.' else 1.0
    return LabeledPoint(label, array([float(value) for value in kept]))
# Apply the reduced-feature parser to both the training and the test records.
training_data_chi = raw_data.map(parse_interaction_chi)
test_data_chi = test_raw_data.map(parse_interaction_chi)
# Build the model
# The timer wraps the training call only.
t0 = time()
logit_model_chi = LogisticRegressionWithLBFGS.train(training_data_chi)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt,3))
Classifier trained in 518.452 seconds
# Pair each test label with the model's prediction for that point. The map is
# lazy, so the predictions are actually computed by the count() timed below.
labels_and_preds = test_data_chi.map(lambda p: (p.label, logit_model_chi.predict(p.features)))
t0 = time()
# Accuracy = matching (label, prediction) pairs / number of test points.
# Index access replaces `lambda (v, p): ...`, which is Python 2-only syntax.
test_accuracy = labels_and_preds.filter(lambda pair: pair[0] == pair[1]).count() / float(test_data_chi.count())
tt = time() - t0
# Single-argument print(...) produces the same output on Python 2 and 3.
print("Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4)))
Prediction made in 21.726 seconds. Test accuracy is 0.9164
from pyspark import SparkContext
sc =SparkContext()
import urllib
# Download the training set; the second argument is the local file name to save to.
f = urllib.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz", "kddcup.data.gz")
# NOTE(review): this absolute path is not derived from the download target
# above; the two only agree when the working directory is this folder — confirm.
data_file = "/home/osboxes/Python with Spark - part 1/pydata/kddcup.data.gz"
raw_data = sc.textFile(data_file)
# count() forces the file to be read, so this also checks that the path is valid.
print "Train data size is {}".format(raw_data.count())
Train data size is 4898431
# Download the test set; the second argument is the local file name to save to.
ft = urllib.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz", "corrected.gz")
# NOTE(review): as with the training file, this absolute path is independent of
# the download target above — confirm both point at the same file.
test_data_file = "/home/osboxes/Python with Spark - part 1/pydata/corrected.gz"
test_raw_data = sc.textFile(test_data_file)
print "Test data size is {}".format(test_raw_data.count())
Test data size is 311029
normal
or attack
.from pyspark.mllib.regression import LabeledPoint
from numpy import array
# Split each raw line on commas, for both the training and the test set.
csv_data = raw_data.map(lambda x: x.split(","))
test_csv_data = test_raw_data.map(lambda x: x.split(","))
# Collect the distinct values of columns 1, 2 and 3 from the TRAINING data only;
# create_labeled_point later uses a value's position in these lists as its numeric level.
protocols = csv_data.map(lambda x: x[1]).distinct().collect()
services = csv_data.map(lambda x: x[2]).distinct().collect()
flags = csv_data.map(lambda x: x[3]).distinct().collect()
create_labeled_point
# function. If a factor level is not in the training data, we assign a special level. Remember that we cannot use testing data for training our model, not even the factor levels. The testing data represents the unknown to us in a real case.
def _factor_level(levels, value):
    """Return the index of `value` in `levels`, or `len(levels)` if it is absent."""
    try:
        return levels.index(value)
    except ValueError:
        # list.index signals a missing value with ValueError; catching only
        # that keeps unrelated failures visible instead of masking them.
        return len(levels)


def create_labeled_point(line_split):
    """Build a LabeledPoint from one already-split CSV record.

    line_split -- list of the record's fields; fields 0-40 become the
                  features and field 41 holds the label text.

    Fields 1, 2 and 3 are replaced by their position in the module-level
    `protocols`, `services` and `flags` lists. A value missing from its
    list gets the extra level `len(list)`. The label is 0.0 for 'normal.'
    and 1.0 for anything else.
    """
    # leave_out = [41]
    clean_line_split = line_split[0:41]
    # convert protocol, service and flag to numeric categorical variables
    clean_line_split[1] = _factor_level(protocols, clean_line_split[1])
    clean_line_split[2] = _factor_level(services, clean_line_split[2])
    clean_line_split[3] = _factor_level(flags, clean_line_split[3])
    # convert label to binary label
    attack = 0.0 if line_split[41] == 'normal.' else 1.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))
# Encode the split training and test records with the same function.
training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)
maxDepth
value small. This will lead to lower accuracy, but we will obtain fewer splits, so later on we can interpret the tree more easily. In a production system we would try increasing this value in order to achieve better accuracy.
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time
# Build the model
t0 = time()
# Features 1, 2 and 3 are declared categorical, each with as many categories
# as distinct training values were collected for it.
# NOTE(review): create_labeled_point encodes a level unseen in training as
# len(levels), which is one past the arity declared here — confirm the model
# copes with that value at prediction time.
tree_model = DecisionTree.trainClassifier(training_data, numClasses=2,
categoricalFeaturesInfo={1: len(protocols), 2: len(services), 3: len(flags)},
impurity='gini', maxDepth=4, maxBins=100)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt,3))
Classifier trained in 302.6 seconds
map
on the test_data
RDD and the model to predict each test point class.predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
filter
and count
as follows.t0 = time()
# Accuracy = matching (label, prediction) pairs / number of test points.
# Index access replaces `lambda (v, p): ...`, which is Python 2-only syntax.
test_accuracy = labels_and_preds.filter(lambda pair: pair[0] == pair[1]).count() / float(test_data.count())
tt = time() - t0
# Single-argument print(...) produces the same output on Python 2 and 3.
print("Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4)))
Prediction made in 32.387 seconds. Test accuracy is 0.918
toDebugString
method in our tree model we can obtain a lot of information regarding splits, nodes, etc.
print "Learned classification tree model:"
print tree_model.toDebugString()
Learned classification tree model: DecisionTreeModel classifier of depth 4 with 29 nodes If (feature 22 <= 74.0) If (feature 25 <= 0.6) If (feature 36 <= 0.43) If (feature 34 <= 0.91) Predict: 0.0 Else (feature 34 > 0.91) Predict: 1.0 Else (feature 36 > 0.43) If (feature 2 in {0.0,3.0,15.0,26.0,36.0,67.0,27.0,18.0,4.0,7.0,20.0,24.0,43.0,44.0,46.0,47.0,55.0,57.0,58.0,60.0,42.0}) Predict: 0.0 Else (feature 2 not in {0.0,3.0,15.0,26.0,36.0,67.0,27.0,18.0,4.0,7.0,20.0,24.0,43.0,44.0,46.0,47.0,55.0,57.0,58.0,60.0,42.0}) Predict: 1.0 Else (feature 25 > 0.6) If (feature 3 in {7.0,4.0,9.0,2.0,3.0,10.0}) If (feature 2 in {3.0,5.0,7.0,8.0,15.0,18.0,50.0,51.0,67.0,12.0,27.0,42.0,58.0,68.0}) Predict: 0.0 Else (feature 2 not in {3.0,5.0,7.0,8.0,15.0,18.0,50.0,51.0,67.0,12.0,27.0,42.0,58.0,68.0}) Predict: 1.0 Else (feature 3 not in {7.0,4.0,9.0,2.0,3.0,10.0}) If (feature 38 <= 0.06) Predict: 0.0 Else (feature 38 > 0.06) Predict: 1.0 Else (feature 22 > 74.0) If (feature 5 <= 0.0) If (feature 11 <= 0.0) If (feature 31 <= 254.0) Predict: 1.0 Else (feature 31 > 254.0) Predict: 1.0 Else (feature 11 > 0.0) If (feature 2 in {12.0}) Predict: 0.0 Else (feature 2 not in {12.0}) Predict: 1.0 Else (feature 5 > 0.0) If (feature 29 <= 0.08) If (feature 4 <= 28.0) Predict: 1.0 Else (feature 4 > 28.0) Predict: 0.0 Else (feature 29 > 0.08) Predict: 1.0
count
, the number of connections to the same host as the current connection in the past two seconds, being greater than 32.dst_bytes
, the number of data bytes from destination to source, is 0.service
is neither level 0 nor 52.logged_in
is false.print "Service 0 is {}".format(services[0])
print "Service 52 is {}".format(services[52])
Service 0 is urp_i Service 52 is tftp_u
count
is the first node split in the tree. Remember that each partition is chosen greedily by selecting the best split from a set of possible splits, in order to maximize the information gain at a tree node (see more here). At a second level we find variables flag
(normal or error status of the connection) and dst_bytes
(the number of data bytes from destination to source) and so on.count
, dst_bytes
, and flag
def create_labeled_point_minimal(line_split):
    """Build a three-feature LabeledPoint from one already-split CSV record.

    line_split -- list of the record's fields; field 41 holds the label text.

    Only fields 3, 5 and 22 are kept, in that order. Field 3 is replaced by
    its position in the module-level `flags` list; a value missing from
    that list gets the extra level `len(flags)`. The label is 0.0 for
    'normal.' and 1.0 for anything else.
    """
    # keep only columns 3, 5 and 22
    clean_line_split = line_split[3:4] + line_split[5:6] + line_split[22:23]
    # convert flag to numeric categorical variable
    try:
        clean_line_split[0] = flags.index(clean_line_split[0])
    except ValueError:
        # list.index raises ValueError for a flag not seen in training
        clean_line_split[0] = len(flags)
    # convert label to binary label
    attack = 0.0 if line_split[41] == 'normal.' else 1.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))
# Encode both datasets with the three-feature parser.
training_data_minimal = csv_data.map(create_labeled_point_minimal)
test_data_minimal = test_csv_data.map(create_labeled_point_minimal)
# Build the model
t0 = time()
# Feature 0 of the reduced vector is the encoded flag, the only categorical entry.
tree_model_minimal = DecisionTree.trainClassifier(training_data_minimal, numClasses=2,
categoricalFeaturesInfo={0: len(flags)},
impurity='gini', maxDepth=3, maxBins=32)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt,3))
Classifier trained in 171.338 seconds
# Predict on the feature vectors alone, then pair each true label with its prediction.
predictions_minimal = tree_model_minimal.predict(test_data_minimal.map(lambda p: p.features))
labels_and_preds_minimal = test_data_minimal.map(lambda p: p.label).zip(predictions_minimal)
t0 = time()
# Accuracy = matching (label, prediction) pairs / number of test points.
# Index access replaces `lambda (v, p): ...`, which is Python 2-only syntax.
test_accuracy = labels_and_preds_minimal.filter(lambda pair: pair[0] == pair[1]).count() / float(test_data_minimal.count())
tt = time() - t0
# Single-argument print(...) produces the same output on Python 2 and 3.
print("Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4)))
Prediction made in 17.446 seconds. Test accuracy is 0.9049
DataFrame
abstraction to perform a more structured exploratory data analysis.from pyspark import SparkContext
sc =SparkContext()
data_file = "/home/osboxes/Python with Spark - part 1/pydata/kddcup.data_10_percent.gz"
raw_data = sc.textFile(data_file).cache()
DataFrame
is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R or Pandas. DataFrames can be constructed from a wide array of sources, such as an existing RDD in our case.
SQLContext
class. To create a basic instance, all we need is a SparkContext
reference. Since we are running Spark in shell mode (using pySpark) we can use the global context object sc
for this purpose.from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
SQLContext
, we are ready to create a DataFrame
from our existing RDD. But first we need to tell Spark SQL the schema in our data.Row
objects to a DataFrame
. Rows are constructed by passing a list of key/value pairs as kwargs to the Row
class. The keys define the column names, and the types are inferred by looking at the first row. Therefore, it is important that there is no missing data in the first row of the RDD in order to properly infer the schema.from pyspark.sql import Row
# Split every raw line on commas.
csv_data = raw_data.map(lambda l: l.split(","))
# Build one Row per record from the first six columns: columns 0, 4 and 5 are
# cast to int, columns 1-3 are kept as strings.
row_data = csv_data.map(lambda p: Row(
duration=int(p[0]),
protocol_type=p[1],
service=p[2],
flag=p[3],
src_bytes=int(p[4]),
dst_bytes=int(p[5])
)
)
Row
we can infer and register the schema.interactions_df = sqlContext.createDataFrame(row_data)
# Register the DataFrame under the table name used in the SQL below.
interactions_df.registerTempTable("interactions")
# Select tcp network interactions with a duration above 1000 (the threshold in the query) and no transfer from destination
tcp_interactions = sqlContext.sql("""
SELECT duration, dst_bytes FROM interactions WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes = 0
""")
tcp_interactions.show()
+--------+---------+ |duration|dst_bytes| +--------+---------+ | 5057| 0| | 5059| 0| | 5051| 0| | 5056| 0| | 5051| 0| | 5039| 0| | 5062| 0| | 5041| 0| | 5056| 0| | 5064| 0| | 5043| 0| | 5061| 0| | 5049| 0| | 5061| 0| | 5048| 0| | 5047| 0| | 5044| 0| | 5063| 0| | 5068| 0| | 5062| 0| +--------+---------+ only showing top 20 rows
# Output duration together with dst_bytes
# Map over the underlying RDD explicitly: DataFrame.map is only available in
# Spark 1.x, whereas DataFrame.rdd works on 1.3+ and on 2.x and later.
tcp_interactions_out = tcp_interactions.rdd.map(lambda p: "Duration: {}, Dest. bytes: {}".format(p.duration, p.dst_bytes))
# collect() brings every formatted row to the driver before printing.
for ti_out in tcp_interactions_out.collect():
    print(ti_out)
Duration: 5057, Dest. bytes: 0 Duration: 5059, Dest. bytes: 0 Duration: 5051, Dest. bytes: 0 Duration: 5056, Dest. bytes: 0 Duration: 5051, Dest. bytes: 0 Duration: 5039, Dest. bytes: 0 Duration: 5062, Dest. bytes: 0 Duration: 5041, Dest. bytes: 0 Duration: 5056, Dest. bytes: 0 Duration: 5064, Dest. bytes: 0 Duration: 5043, Dest. bytes: 0 Duration: 5061, Dest. bytes: 0 Duration: 5049, Dest. bytes: 0 Duration: 5061, Dest. bytes: 0 Duration: 5048, Dest. bytes: 0 Duration: 5047, Dest. bytes: 0 Duration: 5044, Dest. bytes: 0 Duration: 5063, Dest. bytes: 0 Duration: 5068, Dest. bytes: 0 Duration: 5062, Dest. bytes: 0 Duration: 5046, Dest. bytes: 0 Duration: 5052, Dest. bytes: 0 Duration: 5044, Dest. bytes: 0 Duration: 5054, Dest. bytes: 0 Duration: 5039, Dest. bytes: 0 Duration: 5058, Dest. bytes: 0 Duration: 5051, Dest. bytes: 0 Duration: 5032, Dest. bytes: 0 Duration: 5063, Dest. bytes: 0 Duration: 5040, Dest. bytes: 0 Duration: 5051, Dest. bytes: 0 Duration: 5066, Dest. bytes: 0 Duration: 5044, Dest. bytes: 0 Duration: 5051, Dest. bytes: 0 Duration: 5036, Dest. bytes: 0 Duration: 5055, Dest. bytes: 0 Duration: 2426, Dest. bytes: 0 Duration: 5047, Dest. bytes: 0 Duration: 5057, Dest. bytes: 0 Duration: 5037, Dest. bytes: 0 Duration: 5057, Dest. bytes: 0 Duration: 5062, Dest. bytes: 0 Duration: 5051, Dest. bytes: 0 Duration: 5051, Dest. bytes: 0 Duration: 5053, Dest. bytes: 0 Duration: 5064, Dest. bytes: 0 Duration: 5044, Dest. bytes: 0 Duration: 5051, Dest. bytes: 0 Duration: 5033, Dest. bytes: 0 Duration: 5066, Dest. bytes: 0 Duration: 5063, Dest. bytes: 0 Duration: 5056, Dest. bytes: 0 Duration: 5042, Dest. bytes: 0 Duration: 5063, Dest. bytes: 0 Duration: 5060, Dest. bytes: 0 Duration: 5056, Dest. bytes: 0 Duration: 5049, Dest. bytes: 0 Duration: 5043, Dest. bytes: 0 Duration: 5039, Dest. bytes: 0 Duration: 5041, Dest. bytes: 0 Duration: 42448, Dest. bytes: 0 Duration: 42088, Dest. bytes: 0 Duration: 41065, Dest. bytes: 0 Duration: 40929, Dest. 
bytes: 0 Duration: 40806, Dest. bytes: 0 Duration: 40682, Dest. bytes: 0 Duration: 40571, Dest. bytes: 0 Duration: 40448, Dest. bytes: 0 Duration: 40339, Dest. bytes: 0 Duration: 40232, Dest. bytes: 0 Duration: 40121, Dest. bytes: 0 Duration: 36783, Dest. bytes: 0 Duration: 36674, Dest. bytes: 0 Duration: 36570, Dest. bytes: 0 Duration: 36467, Dest. bytes: 0 Duration: 36323, Dest. bytes: 0 Duration: 36204, Dest. bytes: 0 Duration: 32038, Dest. bytes: 0 Duration: 31925, Dest. bytes: 0 Duration: 31809, Dest. bytes: 0 Duration: 31709, Dest. bytes: 0 Duration: 31601, Dest. bytes: 0 Duration: 31501, Dest. bytes: 0 Duration: 31401, Dest. bytes: 0 Duration: 31301, Dest. bytes: 0 Duration: 31194, Dest. bytes: 0 Duration: 31061, Dest. bytes: 0 Duration: 30935, Dest. bytes: 0 Duration: 30835, Dest. bytes: 0 Duration: 30735, Dest. bytes: 0 Duration: 30619, Dest. bytes: 0 Duration: 30518, Dest. bytes: 0 Duration: 30418, Dest. bytes: 0 Duration: 30317, Dest. bytes: 0 Duration: 30217, Dest. bytes: 0 Duration: 30077, Dest. bytes: 0 Duration: 25420, Dest. bytes: 0 Duration: 22921, Dest. bytes: 0 Duration: 22821, Dest. bytes: 0 Duration: 22721, Dest. bytes: 0 Duration: 22616, Dest. bytes: 0 Duration: 22516, Dest. bytes: 0 Duration: 22416, Dest. bytes: 0 Duration: 22316, Dest. bytes: 0 Duration: 22216, Dest. bytes: 0 Duration: 21987, Dest. bytes: 0 Duration: 21887, Dest. bytes: 0 Duration: 21767, Dest. bytes: 0 Duration: 21661, Dest. bytes: 0 Duration: 21561, Dest. bytes: 0 Duration: 21455, Dest. bytes: 0 Duration: 21334, Dest. bytes: 0 Duration: 21223, Dest. bytes: 0 Duration: 21123, Dest. bytes: 0 Duration: 20983, Dest. bytes: 0 Duration: 14682, Dest. bytes: 0 Duration: 14420, Dest. bytes: 0 Duration: 14319, Dest. bytes: 0 Duration: 14198, Dest. bytes: 0 Duration: 14098, Dest. bytes: 0 Duration: 13998, Dest. bytes: 0 Duration: 13898, Dest. bytes: 0 Duration: 13796, Dest. bytes: 0 Duration: 13678, Dest. bytes: 0 Duration: 13578, Dest. bytes: 0 Duration: 13448, Dest. 
bytes: 0 Duration: 13348, Dest. bytes: 0 Duration: 13241, Dest. bytes: 0 Duration: 13141, Dest. bytes: 0 Duration: 13033, Dest. bytes: 0 Duration: 12933, Dest. bytes: 0 Duration: 12833, Dest. bytes: 0 Duration: 12733, Dest. bytes: 0 Duration: 12001, Dest. bytes: 0 Duration: 5678, Dest. bytes: 0 Duration: 5010, Dest. bytes: 0 Duration: 1298, Dest. bytes: 0 Duration: 1031, Dest. bytes: 0 Duration: 36438, Dest. bytes: 0
printSchema
.interactions_df.printSchema()
root |-- dst_bytes: long (nullable = true) |-- duration: long (nullable = true) |-- flag: string (nullable = true) |-- protocol_type: string (nullable = true) |-- service: string (nullable = true) |-- src_bytes: long (nullable = true)
DataFrame
provides a domain-specific language for structured data manipulation. This language includes methods we can chain in order to do selection, filtering, grouping, etc. For example, let's say we want to count how many interactions there are for each protocol type. We can proceed as follows.
from time import time
t0 = time()
# Count rows per protocol_type. show() prints the result, so it runs inside the timed span.
interactions_df.select("protocol_type", "duration", "dst_bytes").groupBy("protocol_type").count().show()
tt = time() - t0
print "Query performed in {} seconds".format(round(tt,3))
+-------------+------+ |protocol_type| count| +-------------+------+ | udp| 20354| | tcp|190065| | icmp|283602| +-------------+------+ Query performed in 18.452 seconds
t0 = time()
# Same per-protocol count, restricted to rows with duration > 1000 and dst_bytes == 0.
interactions_df.select("protocol_type", "duration", "dst_bytes").filter(interactions_df.duration>1000).filter(interactions_df.dst_bytes==0).groupBy("protocol_type").count().show()
tt = time() - t0
print "Query performed in {} seconds".format(round(tt,3))
+-------------+-----+ |protocol_type|count| +-------------+-----+ | tcp| 139| +-------------+-----+ Query performed in 14.254 seconds
def get_label_type(label):
    """Collapse a raw label string into "normal" or "attack".

    Only the exact text "normal." counts as normal; every other value is
    reported as "attack".
    """
    return "normal" if label == "normal." else "attack"
# The six columns used for row_data, plus a label derived from column 41
# through get_label_type ("normal" or "attack").
row_labeled_data = csv_data.map(lambda p: Row(
duration=int(p[0]),
protocol_type=p[1],
service=p[2],
flag=p[3],
src_bytes=int(p[4]),
dst_bytes=int(p[5]),
label=get_label_type(p[41])
)
)
interactions_labeled_df = sqlContext.createDataFrame(row_labeled_data)
t0 = time()
# Number of records per label value.
interactions_labeled_df.select("label").groupBy("label").count().show()
tt = time() - t0
print "Query performed in {} seconds".format(round(tt,3))
+------+------+ | label| count| +------+------+ |normal| 97278| |attack|396743| +------+------+ Query performed in 14.984 seconds
t0 = time()
# Number of records for every (label, protocol_type) combination.
interactions_labeled_df.select("label", "protocol_type").groupBy("label", "protocol_type").count().show()
tt = time() - t0
print "Query performed in {} seconds".format(round(tt,3))
+------+-------------+------+ | label|protocol_type| count| +------+-------------+------+ |attack| icmp|282314| |attack| udp| 1177| |attack| tcp|113252| |normal| icmp| 1288| |normal| udp| 19177| |normal| tcp| 76813| +------+-------------+------+ Query performed in 14.903 seconds
t0 = time()
# Group additionally by the boolean expression dst_bytes == 0, splitting each
# (label, protocol_type) pair into rows with and without bytes from the destination.
interactions_labeled_df.select("label", "protocol_type", "dst_bytes").groupBy("label", "protocol_type", interactions_labeled_df.dst_bytes==0).count().show()
tt = time() - t0
print "Query performed in {} seconds".format(round(tt,3))
+------+-------------+---------------+------+ | label|protocol_type|(dst_bytes = 0)| count| +------+-------------+---------------+------+ |normal| icmp| true| 1288| |attack| udp| true| 1166| |attack| udp| false| 11| |normal| tcp| true| 9313| |normal| tcp| false| 67500| |attack| tcp| true|110583| |attack| tcp| false| 2669| |normal| udp| true| 3594| |normal| udp| false| 15583| |attack| icmp| true|282314| +------+-------------+---------------+------+ Query performed in 14.469 seconds
DataFrame
operations and data sources, have a look at the official documentation here.