krangl / krangl

Package krangl

Contains all parts of the krangl API

Types

AggFun

data class AggFun

AnyCol

class AnyCol : DataCol

ArrayUtils

object ArrayUtils

BooleanCol

class BooleanCol : DataCol

ColNames

class ColNames

ColSpec

data class ColSpec

ColType

Methods to read and write tables into/from DataFrames. See also https://commons.apache.org/proper/commons-csv/ for other implementations, as well as https://github.com/databricks/spark-csv and https://examples.javacodegeeks.com/core-java/apache/commons/csv-commons/writeread-csv-files-with-apache-commons-csv-example/

enum class ColType

ColumnFormula

data class ColumnFormula

ColumnSelector

typealias ColumnSelector = ColNames.() -> List<Boolean?>

DataCol

abstract class DataCol

DataFrame

interface DataFrame

DataFrameRow

typealias DataFrameRow = Map<String, Any?>

DoubleCol

class DoubleCol : DataCol

ExpressionContext

A proxy on the df that exposes just parts of the DataFrame api that are relevant for table expressions

class ExpressionContext : TableContext

Factor

class Factor

InplaceDataFrameBuilder

class InplaceDataFrameBuilder

IntCol

class IntCol : NumberCol

JoinType

DataFrame joining

enum class JoinType

LongCol

class LongCol : NumberCol

NumberCol

abstract class NumberCol : DataCol

RenameRule

data class RenameRule

SleepPattern

data class SleepPattern

SortExpression

typealias SortExpression = SortingContext.(SortingContext) -> Any?

SortingContext

A proxy on the df that exposes just parts of the DataFrame api that are relevant for sorting

class SortingContext : TableContext

StringCol

class StringCol : DataCol

SumFormula

typealias SumFormula = DataCol.(DataCol) -> Any?

SumFuns

Common aggregation functions to be used along with `summarizeAt/If/All`

object SumFuns

SummarizeBuilder

class SummarizeBuilder

TableContext

open class TableContext

TableExpression

typealias TableExpression = ExpressionContext.(ExpressionContext) -> Any?

UnequalByHelpers

object UnequalByHelpers

VectorizedRowPredicate

typealias VectorizedRowPredicate = ExpressionContext.(ExpressionContext) -> BooleanArray

Exceptions

ColumnTypeCastException

class ColumnTypeCastException : RuntimeException

DuplicatedColumnNameException

class DuplicatedColumnNameException : RuntimeException

InvalidColumnOperationException

class InvalidColumnOperationException : RuntimeException

InvalidColumnSelectException

class InvalidColumnSelectException : RuntimeException

InvalidSortingPredicateException

class InvalidSortingPredicateException : RuntimeException

MissingValueException

Thrown if an operation is applied to a column that contains missing values.

class MissingValueException : Throwable

NonScalarValueException

class NonScalarValueException : RuntimeException

Extensions for External Classes

kotlin.Array

kotlin.BooleanArray

kotlin.collections.Iterable

kotlin.collections.List

kotlin.String

Properties

_rand

Random number generator used for row sampling. Reassignable to set seed for deterministic sampling.

var _rand: Random

DEF_NEST_COLUMN_NAME

const val DEF_NEST_COLUMN_NAME: String

flightsData

On-time data for all flights that departed NYC (i.e. JFK, LGA or EWR) in 2013.

val flightsData: DataFrame

irisData

This famous (Fisher's or Anderson's) iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica.

val irisData: DataFrame

MISSING_VALUE

val MISSING_VALUE: String

PRINT_MAX_DIGITS

var PRINT_MAX_DIGITS: Int

PRINT_MAX_ROWS

var PRINT_MAX_ROWS: Int

PRINT_MAX_WIDTH

var PRINT_MAX_WIDTH: Int

PRINT_ROW_NUMBERS

var PRINT_ROW_NUMBERS: Boolean

sleepData

An example data frame with 83 rows and 11 variables

val sleepData: DataFrame

sleepPatterns

val sleepPatterns: List<SleepPattern>

Functions

addRowNumber

Add the row-number as a column to a data-frame.

fun DataFrame.addRowNumber(name: String = "row_number"): DataFrame

all

fun ColNames.all(): List<Boolean>

asFactor

fun DataCol.asFactor(): DataCol

asString

fun DataFrame.asString(title: String = "A DataFrame", colNames: Boolean = true, maxRows: Int = PRINT_MAX_ROWS, maxWidth: Int = PRINT_MAX_WIDTH, maxDigits: Int = PRINT_MAX_DIGITS, rowNumbers: Boolean = PRINT_ROW_NUMBERS): String

asType

fun <R> DataCol.asType(): Array<R?>

bindCols

fun bindCols(left: DataFrame, right: DataFrame, renameDuplicates: Boolean = true): DataFrame

bindRows

Adds new rows. Missing entries are set to null. The output of bindRows() will contain a column if that column appears in any of the inputs.

fun DataFrame.bindRows(df: DataFrame): DataFrame
fun DataFrame.bindRows(vararg someRows: DataFrameRow): DataFrame
fun bindRows(vararg dataFrames: DataFrame): DataFrame

columnTypes

fun columnTypes(df: DataFrame): List<ColSpec>

complete

Turns implicit missing values into explicit missing values. This is a wrapper around expand(), dplyr::left_join() and replace_na() that's useful for completing missing combinations of data.

fun DataFrame.complete(vararg columnNames: String): DataFrame

const

fun ExpressionContext.const(someThing: Any): DataCol

count

Counts observations by group.

fun DataFrame.count(vararg selects: String, name: String = "n"): DataFrame

countExpr

Counts expressions

fun DataFrame.countExpr(vararg moreExpressions: TableExpression, name: String = "n", tableExpression: TableExpression? = null): DataFrame

cumSum

Calculates the cumulative sum of the column values.

fun DataCol.cumSum(): DataCol

dataFrameOf

Create a new data frame in place.

fun dataFrameOf(vararg header: String): InplaceDataFrameBuilder
fun dataFrameOf(header: Iterable<String>): InplaceDataFrameBuilder

Create a new data-frame from a list of DataCol instances

fun dataFrameOf(vararg columns: DataCol): DataFrame

Create a new data-frame from records encoded as key-value maps.

fun dataFrameOf(rows: Iterable<DataFrameRow>): DataFrame

desc

Creates a sorting attribute that inverts the order of the argument

fun DataCol.desc(): IntCol

distinct

Retain only unique/distinct rows from an input tbl.

fun DataFrame.distinct(vararg selects: String = this.names.toTypedArray()): DataFrame

emptyDataFrame

fun emptyDataFrame(): DataFrame

endsWith

fun ColNames.endsWith(prefix: String): List<Boolean?>

eq

infix fun DataCol.eq(i: Any): BooleanArray

except

Performs a negative selection by selecting all columns except the listed ones.

fun ColNames.except(vararg columns: String): List<Boolean?>
fun ColNames.except(columnSelector: ColumnSelector): List<Boolean?>

expand

expand() is often useful in conjunction with left_join if you want to convert implicit missing values to explicit missing values. Or you can use it in conjunction with anti_join() to figure out which combinations are missing.

fun DataFrame.expand(vararg columnNames: String): DataFrame

filter

Filter the rows of a table with a single predicate.

fun DataFrame.filter(predicate: ExpressionContext.(ExpressionContext) -> List<Boolean?>): DataFrame

AND-filter a table with different filters.

fun DataFrame.filter(vararg predicates: DataFrame.(DataFrame) -> List<Boolean>): DataFrame

gather

Gather takes multiple columns and collapses into key-value pairs, duplicating all other columns as needed. You use gather() when you notice that you have columns that are not variables.

fun DataFrame.gather(key: String, value: String, columns: List<String> = this.names, convert: Boolean = false): DataFrame
fun DataFrame.gather(key: String, value: String, columns: ColumnSelector, convert: Boolean = false): DataFrame

ge

infix fun DataCol.ge(i: Number): BooleanArray
infix fun DataCol.ge(i: DataCol): BooleanArray

greaterEqualsThan

fun DataCol.greaterEqualsThan(i: Number): BooleanArray
fun DataCol.greaterEqualsThan(i: DataCol): BooleanArray

greaterThan

fun DataCol.greaterThan(i: Number): BooleanArray
fun DataCol.greaterThan(i: DataCol): BooleanArray

groupBy

Creates a grouped data-frame from a column selector function. See select() for details about column selection.

fun DataFrame.groupBy(columnSelect: ColumnSelector): DataFrame

groupByExpr

Creates a grouped data-frame from one or more table expressions. See addColumn() for details about table expressions.

fun DataFrame.groupByExpr(vararg moreExpressions: TableExpression, tableExpression: TableExpression? = null): DataFrame

gt

infix fun DataCol.gt(i: Number): BooleanArray
infix fun DataCol.gt(i: DataCol): BooleanArray

head

fun DataFrame.head(numRows: Int = 5): DataFrame

innerJoin

fun DataFrame.innerJoin(right: DataFrame, by: String, suffices: Pair<String, String> = ".x" to ".y"): DataFrame
fun DataFrame.innerJoin(right: DataFrame, by: Iterable<String> = defaultBy(this, right), suffices: Pair<String, String> = ".x" to ".y"): DataFrame

isEqualTo

infix fun DataCol.isEqualTo(i: Any): BooleanArray

isListOfType

fun <T> isListOfType(items: List<Any?>): Boolean

isMatching

Match a text column in a NA-aware manner to create a predicate vector for filtering.

fun <T> DataCol.isMatching(missingAs: Boolean = false, filter: T.() -> Boolean): BooleanArray

isNA

Maps a column to true for the NA values and false otherwise.

fun DataCol.isNA(): BooleanArray
fun ExpressionContext.isNA(columnName: String): BooleanArray

isNotNA

fun DataCol.isNotNA(): BooleanArray
fun ExpressionContext.isNotNA(columnName: String): BooleanArray

isOfType

Tests whether the first non-null element in the list has a specific type by peeking into it from the top.

fun <T> isOfType(items: Array<Any?>): Boolean

join

fun join(left: DataFrame, right: DataFrame, by: Iterable<String> = defaultBy(left, right), suffices: Pair<String, String> = ".x" to ".y", type: JoinType): DataFrame

lag

Returns the "previous" column values. Useful for comparing values behind the current values.

fun DataCol.lag(n: Int = 1, default: Any? = null): DataCol

le

infix fun DataCol.le(i: Number): BooleanArray
infix fun DataCol.le(i: DataCol): BooleanArray

lead

Returns the "next" column values. Useful for comparing values ahead of the current values.

fun DataCol.lead(n: Int = 1, default: Any? = null): DataCol

leftJoin

Convenience wrapper around joinLeft that works with a single `by` attribute.

fun DataFrame.leftJoin(right: DataFrame, by: String, suffices: Pair<String, String> = ".x" to ".y"): DataFrame
fun DataFrame.leftJoin(right: DataFrame, by: Iterable<String> = defaultBy(this, right), suffices: Pair<String, String> = ".x" to ".y"): DataFrame

lesserEquals

fun DataCol.lesserEquals(i: Number): BooleanArray
fun DataCol.lesserEquals(i: DataCol): BooleanArray

lesserThan

fun DataCol.lesserThan(i: Number): BooleanArray
fun DataCol.lesserThan(i: DataCol): BooleanArray

listOf

fun ColNames.listOf(vararg someNames: String): List<Boolean?>
fun ColNames.listOf(someNames: List<String>): List<Boolean?>

lt

infix fun DataCol.lt(i: Number): BooleanArray
infix fun DataCol.lt(i: DataCol): BooleanArray

main

fun main(args: Array<String>): Unit

map

Transforms column data into a list of the same length. Missing values are kept as they are, so the transformation itself can be written in a non-null manner.

fun <T> DataCol.map(expr: (T) -> Any?): List<Any?>

matches

fun ColNames.matches(regex: String): List<Boolean?>
fun ColNames.matches(regex: Regex): List<Boolean?>

max

Calculates the maximum of the column values.

fun DataCol.max(removeNA: Boolean = false): Double?

mean

Calculates the arithmetic mean of the column values.

fun DataCol.mean(removeNA: Boolean = false): Double?

median

Calculates the median of the column values.

fun DataCol.median(removeNA: Boolean = false): Double?

min

Calculates the minimum of the column values.

fun DataCol.min(removeNA: Boolean = false): Double?

moveLeft

Push some columns to the left end of a data-frame.

fun DataFrame.moveLeft(vararg columnNames: String): DataFrame

moveRight

Push some columns to the right end of a data-frame.

fun DataFrame.moveRight(vararg columnNames: String): DataFrame

nest

Nest repeated values in a list-variable.

fun DataFrame.nest(colSelect: ColumnSelector = { except(*groupedBy().names.toTypedArray()) }, columnName: String = DEF_NEST_COLUMN_NAME): DataFrame

order

rank returns the order of each element in an ascending list

fun DataCol.order(naLast: Boolean = true): List<Int>

outerJoin

fun DataFrame.outerJoin(right: DataFrame, by: Iterable<String> = defaultBy(this, right)): DataFrame

pctChange

Calculates the percentage change between the current and a prior column value.

fun DataCol.pctChange(): DataCol

print

fun DataFrame.print(title: String = "A DataFrame", colNames: Boolean = true, maxRows: Int = PRINT_MAX_ROWS, maxWidth: Int = PRINT_MAX_WIDTH, maxDigits: Int = PRINT_MAX_DIGITS, rowNumbers: Boolean = PRINT_ROW_NUMBERS): Unit

printDataClassSchema

Provides code to convert a dataframe to a strongly typed list of kotlin data-class instances.

fun DataFrame.printDataClassSchema(dataClassName: String, receiverVarName: String = "dataFrame"): Unit

range

fun ColNames.range(from: String, to: String): List<Boolean?>

rank

fun DataCol.rank(naLast: Boolean = true): List<Int>

remove

Remove columns by column type

fun <T : DataCol> DataFrame.remove(): DataFrame

rename

fun DataFrame.rename(vararg old2new: Pair<String, String>): DataFrame

Rename one or several columns. Positions should be preserved

fun DataFrame.rename(vararg old2new: RenameRule): DataFrame

rowsAs

Convert rows into objects by using reflection. Only parameters used in constructor will be mapped. Note: This is tested with kotlin data classes only. File a ticket for better type compatibility or any issues!

fun <T> DataFrame.rowsAs(mapping: Map<String, String> = names.map { it to it }.toMap()): Iterable<T>

rowwise

Creates a grouped data-frame where each group consists of exactly one line. Thereby the row-number is used as a group-hash.

fun DataFrame.rowwise(): DataFrame

sampleFrac

Select random rows from a table. If receiver is grouped, sampling is done per group.

fun DataFrame.sampleFrac(fraction: Double, replace: Boolean = false): DataFrame

sampleN

Select random rows from a table. If receiver is grouped, sampling is done per group.

fun DataFrame.sampleN(n: Int, replace: Boolean = false): DataFrame

schema

Prints the schema (that is column names, types, and the first few values per column) of a dataframe to stdout.

fun DataFrame.schema(maxDigits: Int = 3, maxWidth: Int = PRINT_MAX_WIDTH): Unit

sd

Calculates the standard deviation of the column values.

fun DataCol.sd(removeNA: Boolean = false): Double?

select

Select columns by column type

fun <T : DataCol> DataFrame.select(): DataFrame

semiJoin

fun DataFrame.semiJoin(right: DataFrame, by: String): DataFrame
fun DataFrame.semiJoin(right: DataFrame, by: Iterable<Pair<String, String>>): DataFrame
fun DataFrame.semiJoin(right: DataFrame, by: Iterable<String> = defaultBy(this, right), suffices: Pair<String, String> = ".x" to ".y"): DataFrame

separate

Given either regular expression or a vector of character positions, separate() turns a single character column into multiple columns.

fun DataFrame.separate(column: String, into: List<String>, sep: String = "[^\\w]", remove: Boolean = true, convert: Boolean = false): DataFrame

setNames

Replace current column names with new ones. The number of provided names must match the number of columns.

fun DataFrame.setNames(vararg newNames: String): DataFrame

shuffle

Randomize the row order of a data-frame.

fun DataFrame.shuffle(): DataFrame

slice

Select rows by position while taking into account grouping in a data-frame.

fun DataFrame.slice(vararg slices: Int): DataFrame
fun DataFrame.slice(slice: IntRange): DataFrame

sortedBy

fun DataFrame.sortedBy(sortExpression: SortExpression): DataFrame
fun DataFrame.sortedBy(vararg sortExpressions: SortExpression): DataFrame

spread

Spread a key-value pair across multiple columns.

fun DataFrame.spread(key: String, value: String, fill: Any? = null, convert: Boolean = false): DataFrame

startsWith

fun ColNames.startsWith(prefix: String): List<Boolean?>

sum

Calculates the sum of the column values.

fun DataCol.sum(removeNA: Boolean = false): Number?

summarize

fun DataFrame.summarize(name: String, tableExpression: TableExpression): DataFrame

summarizeAt

fun DataFrame.summarizeAt(columnSelect: ColumnSelector, vararg aggfuns: AggFun): DataFrame
fun DataFrame.summarizeAt(columnSelect: ColumnSelector, op: (SummarizeBuilder.() -> Unit)? = null): DataFrame

tail

fun DataFrame.tail(numRows: Int = 5): DataFrame

take

fun DataFrame.take(numRows: Int = 5): DataFrame

takeLast

fun DataFrame.takeLast(numRows: Int): DataFrame

toBooleans

fun DataCol.toBooleans(): Array<Boolean?>

toDoubleMatrix

fun DataFrame.toDoubleMatrix(): Array<DoubleArray>

toDoubles

fun DataCol.toDoubles(): Array<Double?>

toFloatMatrix

fun DataFrame.toFloatMatrix(): Array<FloatArray>

toInts

fun DataCol.toInts(): Array<Int?>

toLongs

fun DataCol.toLongs(): Array<Long?>

toMap

Expose a view on the data as map from column names to nullable arrays.

fun DataFrame.toMap(): Map<String, Array<*>>

toStrings

fun DataCol.toStrings(): Array<String?>

unfold

fun <T> DataFrame.unfold(columnName: String, properties: List<String> = detectPropertiesByReflection<T>().map { it.name }, keep: Boolean = true): DataFrame

unite

Convenience function to paste together multiple columns into one.

fun DataFrame.unite(colName: String, which: List<String>, sep: String = "_", remove: Boolean = true): DataFrame
fun DataFrame.unite(colName: String, vararg which: ColNames.() -> List<Boolean?>, sep: String = "_", remove: Boolean = true): DataFrame

unnest

If you have a list-column, this makes each element of the list its own row. It unfolds data vertically. unnest() can handle list-columns that contain atomic vectors, lists, or data frames (but not a mixture of the different types).

fun DataFrame.unnest(columnName: String): DataFrame

writeCSV

fun DataFrame.writeCSV(file: File, format: CSVFormat = CSVFormat.DEFAULT.withHeader(*names.toTypedArray())): Unit

writeExcel

fun DataFrame.writeExcel(filePath: String, sheetName: String, headers: Boolean = true, eraseFile: Boolean = false, boldHeaders: Boolean = true): Unit

writeTSV

fun DataFrame.writeTSV(file: File, format: CSVFormat = CSVFormat.TDF.withHeader(*names.toTypedArray())): Unit

Companion Object Functions

builder

Create a new data frame in place.

fun DataFrame.Companion.builder(vararg header: String): InplaceDataFrameBuilder

fromJson

fun DataFrame.Companion.fromJson(file: File): DataFrame
fun DataFrame.Companion.fromJson(fileOrUrl: String): DataFrame
fun DataFrame.Companion.fromJson(url: URL): DataFrame

fromJsonString

fun DataFrame.Companion.fromJsonString(jsonData: String): DataFrame

fromRecords

Create a data-frame from a list of objects

fun <T> DataFrame.Companion.fromRecords(records: Iterable<T>, mapping: (T) -> DataFrameRow): DataFrame

fromResultSet

fun DataFrame.Companion.fromResultSet(rs: ResultSet): DataFrame

readCSV

fun DataFrame.Companion.readCSV(fileOrUrl: String, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), colTypes: Map<String, ColType> = mapOf()): DataFrame
fun DataFrame.Companion.readCSV(file: File, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), colTypes: Map<String, ColType> = mapOf()): DataFrame

readDelim

fun DataFrame.Companion.readDelim(uri: URI, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), isCompressed: Boolean = uri.toURL().toString().endsWith(".gz"), colTypes: Map<String, ColType> = mapOf()): DataFrame
fun DataFrame.Companion.readDelim(inStream: InputStream, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), isCompressed: Boolean = false, colTypes: Map<String, ColType> = mapOf()): DataFrame
fun DataFrame.Companion.readDelim(reader: Reader, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), colTypes: Map<String, ColType> = mapOf(), skip: Int = 0): DataFrame

readExcel

Returns a DataFrame with the contents from an Excel file sheet

fun DataFrame.Companion.readExcel(path: String, sheet: Int = 0, cellRange: CellRangeAddress? = null, colTypes: Map<String, ColType> = mapOf(), trim_ws: Boolean = false, guessMax: Int = 100, na: String = MISSING_VALUE, stopAtBlankLine: Boolean = true, includeBlankLines: Boolean = false): DataFrame
fun DataFrame.Companion.readExcel(path: String, sheet: String, cellRange: CellRangeAddress? = null, colTypes: Map<String, ColType> = mapOf(), trim_ws: Boolean = false, guessMax: Int = 100, na: String = MISSING_VALUE, stopAtBlankLine: Boolean = true, includeBlankLines: Boolean = false): DataFrame

readTSV

fun DataFrame.Companion.readTSV(fileOrUrl: String, format: CSVFormat = CSVFormat.TDF.withHeader(), colTypes: Map<String, ColType> = mapOf()): DataFrame
fun DataFrame.Companion.readTSV(file: File, format: CSVFormat = CSVFormat.TDF.withHeader(), colTypes: Map<String, ColType> = mapOf()): DataFrame