Nothing
#
# bytescircle
#
# by Roberto S. Galende
# port of linux' bytes-circle to R
# v1.0, Nov 2016
# v1.1, Dec 2016
#
# licensed under GPL-3
#
#' Statistics About Bytes Contained in a File as a Circle Plot
#'
#' bytescircle is a function that shows statistics about bytes contained in a file
#' as a circle graph of deviations from mean in sigma increments.
#' Histogram and boxplot graphs can also be generated.
#'
#' The function can be useful to statistically analyse the content of files
#' in a glimpse: text files are shown as a green centered crown, compressed
#' and encrypted files should be shown as equally distributed variations with
#' a very low CV (sigma/mean), and other types of files can be classified between
#' these two categories depending on their text vs binary content, which can be
#' useful to quickly determine how information is stored inside them (databases,
#' multimedia files, etc).
#'
#' bytescircle() accepts a character string as path for the file, though if
#' it is not indicated, a file selection GUI will demand it.
#' The 'ascii=TRUE' param replicates the linux behaviour of bytes-circle command
#' with params '-o 1', or equivalently '-b', as RStudio or R output do not have
#' colour output.
#'
#' bytescircle() outputs data (file, mean, sd, CV, file size) on R console, but
#' this can be turned off using 'output=0'. A value of 2 will output the char
#' array used for ascii graph output.
#'
#' 'plot' param accepts a number from 0 (no plot) to 5 (boxplot)
#'
#' Colours can be indicated as a vector of colours from 1 to 3 elements, which
#' will be used differently depending on the plot selected. By default, the
#' first colour of the vector will replace the default green, the second the
#' default red, and the third the default blue. Not all colours are used on
#' every plot.
#'
#' bytescircle() can accept its own output as input using 'input=variable'.
#' This can be useful for generating a new graph without the hassle of R reading
#' and analysing the file again. The input can also be a bare 256 element vector:
#' in this case each element represents the appearances in the file of that
#' [n-1] byte value.
#'
#' @param FILE char array with the path to an existing file to analyse
#' @param ascii boolean, if TRUE R will output an ascii circle char of deviations
#' from sigma (true sd). Each ascii char represents a different deviation from
#' sigma. The array of chars used (from -9/4 to +9/4 sigma, in increments
#' of 0.5 sigma) can be printed using parameter 'output=2'
#' @param plot number from 0 to 5, indicates plot to represent:
#'
#' 0: no plot
#'
#' 1: circle of bytes: using an archimedean spiral each byte value is represented
#' with a coloured circle which size indicates the amount of deviation from
#' sigma. A green colour indicates positive sigma value whilst red indicates
#' a negative sigma value. Small blue circles represent byte values that do
#' not appear in the file
#'
#' 2: circle of bytes with indication of the byte bucket represented
#'
#' 3: graph of byte counts: in green values over mean, in red values below it.
#' Also the lines for +/- sd over mean (black dotted line), IQR (Interquartile
#' Range) (dotted green line), and boxplot's binf and bsup values (dotted blue)
#' values are represented as horizontal lines
#'
#' 4: bar graph of byte counts
#'
#' 5: boxplot() graph of byte's data
#'
#' Note that ascii parameter's value is independent of the value of 'plot'
#'
#' @param col vector of color values, colours can be indicated as a vector of
#' colours from 1 to 3 elements, which will be used differently depending on the
#' plot selected. By default, the first colour of the vector will replace the
#' default green, the second the default red, and the third the default blue.
#' Not all colours are used on every plot.
#' @param output integer (0, 1, 2), as function outputs data (file, mean, sd,
#' CV, file size) on R console after every call, this output can be turned
#' off using 'output=0'. A value of 2 will output the char array used for ascii
#' graph output.
#' @param input list or vector, the function can accept its own output as input.
#' This can be useful for generating a new graph without the hassle of R reading
#' and analysing the file again. The input can also be a bare 256 element vector:
#' in this case each element represents the appearances in the file of that
#' [n-1] byte value.
#' @param restrict boolean, if TRUE statistics will use only the number of byte values
#' (buckets) that appear in the file, and not the 256 default value. This makes
#' a difference only if there're byte values that do not appear in the file.
#'
#' @return list of values:
#'
#' $bytes: vector of 256 elements, counts of each byte value in the file
#'
#' $deviation: vector of 256 elements, (count-mean)/sigma for each byte value in the file
#'
#' $file: char array, input file analysed. If input were a variable, it is "R input"
#'
#' $mean: mean value
#'
#' $sd: sigma (true sd) value: sigma=sd()*sqrt((n-1)/n)
#'
#' $cv: coefficient of variation (sigma/mean*100)
#'
#' $circle: complex matrix representing an ascii circle: each element is the
#' deviation from sigma of the represented byte. Elements which do not
#' represent bytes get the value '0+1i'. See bytescircle's User Manual (R vignette).
#'
#' @examples
#' bytescircle( system.file("extdata", "gplv3.txt", package="bytescircle"),
#' ascii=TRUE, plot=1, output=2)
#'
#' # which bytes in this file have a sd greater than 2*sigma?
#' BYTES=bytescircle( system.file("extdata", "gplv3.txt.gz", package="bytescircle"), plot=3,
#' col=c("gold","blueviolet"));
#' which(BYTES$deviation>2.0)-1 # -1, 'cause BYTES[1] corresponds to byte 0
#'
#' # use a vector as input:
#' BYTES=c(256:1); bytescircle(input=BYTES,output=0)
#'
#' @author Roberto S. Galende <roberto.s.galende at gmail.com>
#'
#' @seealso
#' bytescircle's User Manual (R vignette).
#'
#' Origin of bytes-circle linux command: \url{https://circulosmeos.wordpress.com/2015/10/10/statistics-circle-for-analysing-byte-entropy-in-files/}
#'
#' Source code repository: \url{https://github.com/circulosmeos/bytescircle}
#'
#' @importFrom graphics abline axis boxplot legend par plot points text title
#' @importFrom stats IQR quantile sd
#' @importFrom utils capture.output
#'
#' @export
bytescircle <- function ( FILE = "", ascii = FALSE, plot = 1, col = c(), output = 1, input = NULL, restrict = FALSE ) {
  # Counts how often each of the 256 byte values occurs (read from FILE, or
  # taken from 'input'), derives mean / sigma / CV of those counts, optionally
  # prints an ascii circle and a text summary, draws the plot selected by
  # 'plot', and invisibly returns a list with elements
  # bytes, deviation, file, mean, sd, cv and circle.

  token <- bytescircle.token()

  # three colours can be assigned;
  # otherwise they get predefined default values depending on the graph type
  if (plot < 4) {
    color <- c(token$color_green1, token$color_red1, token$color_blue1)
  } else {
    color <- c(token$color_green2, token$color_red2, token$color_blue1)
  }
  if (length(col) > 3) {
    col <- col[1:3]
  }
  if (length(col) > 0) {
    color[seq_along(col)] <- col
  }

  if ( !is.null(input) ) {
    # accept its own output (or a bare vector of counts) as input,
    # so new graphs can be generated without reading the file again
    FILE <- "R input"
    if ( is.null(names(input)) ) {
      # input is a simple vector of counts
      if (length(input) != token$MAX_VALUE) {
        stop(
          paste("array passed has not proper length (", token$MAX_VALUE, " elements required).")
        )
      }
      BYTE <- input
    } else {
      # input is the complete bytescircle() output
      if ( !("bytes" %in% names(input)) ) {
        # the element checked for (and read below) is called 'bytes'
        stop( "data frame passed has no $bytes row name." )
      }
      BYTE <- input$bytes
      if ( "file" %in% names(input) ) {
        FILE <- input$file
      }
    }
    SIZE <- sum(BYTE)
  } else {
    # no data passed: a FILE must be read
    if (nchar(FILE) == 0) {
      FILE <- file.choose(new = FALSE)
    }
    # check that the file can actually be read: mode 4 tests read permission
    # (mode 0 would only test for existence)
    if (file.access(FILE, mode = 4) != 0) {
      stop(
        paste("file '", FILE, "' is not readable. Process aborted.")
      )
    }
    SIZE <- file.info(FILE)$size
    # initialize byte's bucket array counter
    BYTE <- rep(0, token$MAX_VALUE)
    # load counts of bytes through the native routine
    # (an earlier pure-R block reader was too slow for big files)
    BYTE <- bytescircle_read_file( FILE, BYTE )
  }

  # counts and more counts on bytes
  if (restrict == FALSE) {
    MEAN <- mean(BYTE)
    # We want the uncorrected sample standard deviation; sd() returns the
    # corrected one, so rescale it by sqrt((n-1)/n)
    # see https://en.wikipedia.org/wiki/Standard_deviation#Uncorrected_sample_standard_deviation
    SIGMA <- sd(BYTE) * sqrt((token$MAX_VALUE - 1) / token$MAX_VALUE)
  } else {
    # statistics restricted to the byte values that do appear
    present <- BYTE[BYTE > 0]
    if (length(present) > 1) {
      MEAN  <- mean(present)
      SIGMA <- sd(present) * sqrt((length(present) - 1) / length(present))
    } else if (length(present) == 1) {
      # sd() of a single value is NA, which would break every later
      # 'SIGMA > 0' test; a single bucket has no dispersion
      MEAN  <- mean(present)
      SIGMA <- 0
    } else {
      MEAN  <- 0
      SIGMA <- 0
    }
  }

  # deviations scaled by 4 for the ascii / circle representations
  if (SIGMA > 0) {
    BYTES <- list(bytes = BYTE,
                  deviation = (BYTE - MEAN) / SIGMA * 4 )
  } else {
    BYTES <- list(bytes = BYTE,
                  deviation = rep(0, token$MAX_VALUE) )
  }
  # clamp the deviations of present bytes to +/- MAX_SIGMA_CHAR ...
  clipped <- which(abs(BYTES$deviation) >= token$MAX_SIGMA_CHAR & BYTES$bytes != 0)
  BYTES$deviation[clipped] <- sign(BYTES$deviation[clipped]) * token$MAX_SIGMA_CHAR
  # ... and flag byte values with zero count with the 'missing' marker
  BYTES$deviation[which(BYTE == 0)] <- token$MISSING_CHAR_INDEX

  # matrix behind the ascii art; it is also part of the returned value
  circle <- create.statistics.circle( BYTES, token )
  if (ascii == TRUE) {
    cat("\n") # line jump before ascii graph
    print.circle( circle, token )
  }

  # plot data as a circle graph of sd increment little circles
  if (plot == 1 || plot == 2) {
    coordinates <- create.statistics.circle.to.plot( token )
    plot.circle( BYTES, FILE, coordinates, color, token )
  }
  # ... and add text info of bytes
  if (plot == 2) {
    text(Re(coordinates), Im(coordinates), labels = c(0:255))
  }

  # graph just byte counts (with colors)
  if (plot == 3) {
    # the margins and clipping are changed below: make sure they are put back
    # even if one of the plotting calls fails
    saved.par <- par(c("mar", "xpd"))
    on.exit(par(saved.par), add = TRUE)

    over   <- which(BYTES$bytes >= MEAN)
    under  <- which(BYTES$bytes < MEAN & BYTES$bytes != 0)
    absent <- which(BYTES$bytes == 0)

    # one extra margin line on top while the plot and its top axis are drawn
    par(mar = par()$mar + c(0, 0, 1, 0))
    plot( over - 1,
          BYTES$bytes[over],
          col = color[1],
          pch = 18,
          xlim = c(1, token$MAX_VALUE),
          ylim = c(min(BYTES$bytes), max(BYTES$bytes)),
          xaxt = 'n',
          xlab = "byte #",
          ylab = "counts"
    )
    axis(3)
    points( under - 1,
            BYTES$bytes[under],
            col = color[2],
            pch = 18
    )
    points( absent - 1,
            BYTES$bytes[absent],
            col = color[3],
            pch = 1
    )
    abline( h = MEAN )
    # +/-sd
    abline( h = (MEAN + SIGMA), lty = 3)
    abline( h = (MEAN - SIGMA), lty = 3)
    # IQR
    abline( h = quantile(BYTES$bytes, 0.25), lty = 4, col = color[1])
    abline( h = quantile(BYTES$bytes, 0.75), lty = 4, col = color[1])
    # boxplot binf and bsup
    abline( h = max(min(BYTES$bytes), quantile(BYTES$bytes, 0.25) - 1.5 * IQR(BYTES$bytes)), lty = 5, col = color[3])
    abline( h = min(max(BYTES$bytes), quantile(BYTES$bytes, 0.75) + 1.5 * IQR(BYTES$bytes)), lty = 5, col = color[3])
    par.xpd <- par()$xpd
    # the margin is reset BEFORE title and legend are drawn (as it always
    # was); clipping is disabled so the legend can sit outside the plot region
    par(xpd = TRUE, mar = par()$mar - c(0, 0, 1, 0))
    title(main = FILE)
    # legend
    legend( "bottom", horiz = TRUE, bty = 'n', cex = 0.8,
            inset = c(0, -0.1),
            legend = c("mean", "sd", "IQR", "binf&bsup"),
            col = c("black", "black", color[1], color[3]),
            lty = c(1, 3, 4, 5)
    )
    par( xpd = par.xpd )
  }

  # bar graph of byte counts
  if (plot == 4) {
    plot( c(0:255),
          BYTES$bytes,
          type = "h",
          xlab = "byte",
          ylab = "counts",
          main = FILE
    )
  }

  # graph boxplot
  if (plot == 5) {
    boxplot(BYTES$bytes,
            ylab = "counts",
            main = FILE
    )
    # with mean value
    points( MEAN, col = color[1], pch = 18 )
  }

  # show (and return) Windows paths with forward slashes
  if (Sys.info()['sysname'] == "Windows") {
    FILE <- gsub( "\\\\", "/" , FILE)
  }

  if (MEAN > 0) {
    CV <- SIGMA / MEAN * 100
  } else {
    CV <- 0
  }

  # text summary on the console
  if (output != 0 || ascii == TRUE) {
    cat("file = ", FILE, "\n")
    if (restrict == FALSE) {
      cat("mean = ", round(MEAN, 3), "\n")
    } else {
      cat("mean = ", round(MEAN, 3),
          "(", length(BYTES$bytes[BYTES$bytes>0]), "/", token$MAX_VALUE, "byte buckets)\n")
    }
    cat("sigma= ", round(SIGMA, 3), "( CV= ", round(CV,4), "% )", "\n")
    # scale the size down by 1024 per unit, never past the last known unit
    readable_size <- SIZE
    i <- 1
    while (readable_size > 1024.0 && i < length(token$SIZE_UNITS)) {
      i <- i + 1
      readable_size <- readable_size / 1024.0
    }
    cat("size = ", round(readable_size,2), token$SIZE_UNITS[i], " (", SIZE, "bytes) \n")
    if (output == 2 && ascii == TRUE) {
      cat("chars= ", token$sigma_char, " (0.5 sigma (", round(SIGMA/2, 3), ") each)\n")
    }
  }

  # assemble the returned data; the returned deviation is the plain
  # (count-mean)/sigma, without the x4 scaling and clamping used for drawing
  BYTES$file <- FILE
  BYTES$mean <- MEAN
  if (SIGMA > 0) {
    BYTES$deviation <- (BYTE - MEAN) / SIGMA
  } else {
    BYTES$deviation <- rep(0, token$MAX_VALUE)
  }
  BYTES$sd <- SIGMA
  BYTES$cv <- CV
  if (restrict == TRUE) {
    explanation <- paste("calculated using", length(BYTES$bytes[BYTES$bytes>0]), "byte buckets")
    attr(BYTES$mean, "note") <- explanation
    attr(BYTES$sd,   "note") <- explanation
    attr(BYTES$cv,   "note") <- explanation
  }
  attr(BYTES$cv, "description") <- "percentage value"
  BYTES$circle <- circle
  invisible (BYTES)
}
#' @useDynLib bytescircle, .registration = TRUE
bytescircle_read_file <- function ( FILE, BYTE ) {
  # Hands FILE and BYTE to the registered native routine
  # 'bytescircle_read_file_'. .C() returns the list of its arguments as they
  # are after the call; the second element (BYTE) is handed back.
  # NOTE(review): presumably the routine fills BYTE with per-byte counts --
  # confirm against the C source.
  native.result <- .C(bytescircle_read_file_, FILE, BYTE)
  native.result[[2]]
}
# package-wide constants, returned as a named list
bytescircle.token <- function () {
  max.x <- 35
  max.y <- 14
  # glyphs for the ascii circle; the last but one marks a byte value that is
  # missing from the file, the last one an empty cell
  glyphs <- c( '.', ',', '-', '~', '+', '*', 'o', 'O', '#', '@',
               '=' ,
               ' ' )
  n.glyphs <- length( glyphs )
  list(
    CIRCLE_EMPTY_VALUE = 0+1i,
    MAX_X              = max.x,
    MAX_Y              = max.y,
    # total byte values: [1]==0 ... [255]==254! => [256]==255
    MAX_VALUE          = 256,
    INIT_X             = round( max.x/2 ),
    INIT_Y             = round( max.y/2 )+1,
    sigma_char         = glyphs,
    MAX_SIGMA_CHAR     = n.glyphs -2,
    MISSING_CHAR       = glyphs[ n.glyphs -1 ],
    MISSING_CHAR_INDEX = n.glyphs -1,
    EMPTY_CHAR         = glyphs[ n.glyphs ],
    EMPTY_CHAR_INDEX   = n.glyphs,
    SIZE_UNITS         = c( "bytes", "kiB", "MiB", "GiB", "TiB", "PiB" ),
    color_green1       = "darkolivegreen2",
    color_red1         = "coral",
    color_blue1        = "blue",
    color_green2       = "green",
    color_red2         = "red"
  )
}
# returns a matrix of deviations appropriate for print.circle()
#
# BYTES: list whose $deviation holds one value per byte value
# token: constants list (CIRCLE_EMPTY_VALUE, MAX_X, MAX_Y, MAX_VALUE,
#        INIT_X, INIT_Y are read here)
#
# The result is a MAX_Y x MAX_X complex matrix initialised with
# CIRCLE_EMPTY_VALUE. Deviations 1 and 2 go to the start cell and its right
# neighbour; each further deviation is stored in the first still-empty cell
# reached by a walk whose step length grows by inc_r and whose direction
# turns by inc_angle at every step (vertical steps are halved).
# Stops with an error if the walk leaves the matrix.
create.statistics.circle <- function ( BYTES, token ) {
  angle      <- 0.0
  inc_angle  <- 5.0
  r          <- 1.0
  inc_r      <- 0.0184
  proportion <- 0.5   # vertical steps are half as long as horizontal ones
  x <- token$INIT_X
  y <- token$INIT_Y
  circle <- matrix(token$CIRCLE_EMPTY_VALUE, nrow = token$MAX_Y, ncol = token$MAX_X)

  # the first two values are placed side by side at the starting position
  circle[round(y), round(x)] <- BYTES$deviation[1]
  x <- x + 1
  circle[round(y), round(x)] <- BYTES$deviation[2]

  for ( n in 3:token$MAX_VALUE ) {
    # keep walking until a cell that is still empty is reached
    repeat {
      angle <- angle + inc_angle
      r <- r + inc_r
      x <- x - (cos(angle) * r)
      y <- y + (sin(angle) * r * proportion)
      xx <- trunc(x)
      yy <- trunc(y)
      if (xx > token$MAX_X || yy > token$MAX_Y || xx < 1 || yy < 1) {
        # indexing outside the matrix cannot be recovered from: fail with a
        # clear message instead of an obscure subscript error
        stop("Error creating circle of characters! Cell (", yy, ", ", xx,
             ") lies outside the ", token$MAX_Y, "x", token$MAX_X, " grid.",
             call. = FALSE)
      }
      if ( circle[yy, xx] == token$CIRCLE_EMPTY_VALUE ) {
        circle[yy, xx] <- BYTES$deviation[n]
        break
      }
    }
  }
  circle
}
# builds the vector of plot positions (one complex number per byte value,
# real part = x, imaginary part = y) along an archimedean spiral
create.statistics.circle.to.plot <- function ( token ) {
  origin.x   <- 0
  origin.y   <- 0
  radius     <- 0.3
  turn.rate  <- 6      # angle = radius * turn.rate
  radius.inc <- 0.001  # granularity of the search for the next position
  arc.step   <- 0.2    # spacing between consecutive positions, measured with
                       # length.on.archimedes.spiral()
  positions <- numeric(token$MAX_VALUE)
  for ( n in seq_len(token$MAX_VALUE) ) {
    # grow the radius in small increments until the length measure has
    # advanced by at least arc.step
    target <- length.on.archimedes.spiral(radius) + arc.step
    radius <- radius + radius.inc
    while ( length.on.archimedes.spiral(radius) < target ) {
      radius <- radius + radius.inc
    }
    xy <- porlar2cartesian(radius, radius * turn.rate)
    positions[n] <- complex(real      = xy[1] + origin.x,
                            imaginary = xy[2] + origin.y)
  }
  # slight manual corrections for a better visual appearance
  positions[1] <- -0.05+0.3i
  positions[2] <- positions[2]-0.1+0.1i
  positions
}
# evaluates 0.5 * ( sec(u)*tan(u) + log|sec(u) + tan(u)| ) with u = atan(theta);
# used as the length measure along the spiral when spacing the plot positions
length.on.archimedes.spiral <- function ( theta ) {
  u <- atan(theta)
  sec.u <- 1/cos(u)
  tan.u <- tan(u)
  0.5 * ( sec.u*tan.u + log(abs(sec.u + tan.u)) )
}
# polar (r, theta) to cartesian: returns c(x, y)
porlar2cartesian <- function ( r, theta ) {
  r * c(cos(theta), sin(theta))
}
# prints on R output an ascii circle graph
# emulating the circle plot
#
# circle: complex matrix as built by create.statistics.circle()
# token : constants list (MAX_X, MAX_Y, sigma_char, MAX_SIGMA_CHAR,
#         MISSING_CHAR, MISSING_CHAR_INDEX, EMPTY_CHAR, CIRCLE_EMPTY_VALUE)
#
# Each cell becomes one character: EMPTY_CHAR for cells equal to
# CIRCLE_EMPTY_VALUE, MISSING_CHAR for cells equal to MISSING_CHAR_INDEX, and
# otherwise the sigma_char glyph selected by the cell's real part.
# Prints one line per matrix row and returns the lines as a character vector.
print.circle <- function ( circle, token ) {
  half.range <- trunc(token$MAX_SIGMA_CHAR / 2)
  columns <- seq_len(token$MAX_X)
  rows.to.print <- vapply(seq_len(token$MAX_Y), function(y) {
    cells <- circle[y, columns]
    is.empty   <- cells == token$CIRCLE_EMPTY_VALUE
    is.missing <- !is.empty & cells == token$MISSING_CHAR_INDEX
    # glyph index: two units of deviation per glyph, centred on the middle of
    # the glyph table and clamped to the valid range 1..MAX_SIGMA_CHAR
    index <- trunc(Re(cells) / 2) + half.range + 1
    index <- pmin(pmax(index, 1), token$MAX_SIGMA_CHAR)
    glyphs <- token$sigma_char[index]
    glyphs[is.missing] <- token$MISSING_CHAR
    glyphs[is.empty]   <- token$EMPTY_CHAR
    paste(glyphs, collapse = "")
  }, character(1))
  # print rows each one on a new line; sep="" matters, because cat()'s default
  # separator would prepend a blank to every row after the first
  cat(paste0(rows.to.print, "\n"), sep = "")
  rows.to.print
}
# plots the archimedean spiral with different circle sizes and colours
#
# BYTES      : list whose $deviation holds one value per byte value
# FILE       : text used as the plot title
# coordinates: complex vector, real part = x and imaginary part = y
# color      : colours for non-negative deviations, negative deviations and
#              missing byte values, in that order
# token      : constants list (MAX_VALUE and MISSING_CHAR_INDEX are read here)
plot.circle <- function ( BYTES, FILE, coordinates, color, token ) {
  xs <- Re(coordinates)
  ys <- Im(coordinates)
  # empty canvas spanning all the coordinates
  plot(0, 0, cex = 0,
       axes = FALSE, frame.plot = FALSE,
       xlim = range(xs),
       ylim = range(ys),
       xlab = "",
       ylab = "",
       main = FILE
  )
  buckets <- seq_len(token$MAX_VALUE)
  deviation <- BYTES$deviation[buckets]
  # symbol size grows with |deviation|, but never drops below 0.5
  sizes  <- pmax(abs(deviation / 1.8), 0.5)
  paints <- ifelse(deviation < 0, color[2], color[1])
  absent <- deviation == token$MISSING_CHAR_INDEX
  # drawn one by one, in byte order, so that overlapping symbols stack the
  # same way on every run
  for (n in buckets) {
    if ( absent[n] ) {
      # byte value flagged as missing: small symbol in the third colour
      points(xs[n], ys[n],
             cex = 0.5,
             col = color[3])
    } else {
      points(xs[n], ys[n],
             cex = sizes[n], pch = 19,
             col = paints[n])
    }
  }
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.