krangl / krangl / DataFrame

DataFrame

interface DataFrame

Properties

cols

Returns the ordered list of columns of this data-frame.

abstract val cols: List<DataCol>

names

Returns the ordered list of column names of this data-frame.

abstract val names: List<String>

ncol

abstract val ncol: Int

nrow

abstract val nrow: Int

rows

abstract val rows: Iterable<DataFrameRow>

Functions

addColumn

Adds new variables and preserves existing.

abstract fun addColumn(tf: ColumnFormula): DataFrame
open fun addColumn(columnName: String, expression: TableExpression): DataFrame

addColumns

open fun addColumns(vararg columnFormulas: ColumnFormula): DataFrame

addRow

Returns a DataFrame containing the new row. The new row must be a list whose length matches the number of columns in the DataFrame.

open fun addRow(row: List<Any?>): DataFrame

filter

abstract fun filter(predicate: VectorizedRowPredicate): DataFrame

filterByRow

open fun filterByRow(rowFilter: DataFrameRow.(DataFrameRow) -> Boolean): DataFrame

get

Returns a column by name.

abstract operator fun get(columnName: String): DataCol

Returns a column by index.

open operator fun get(columnIndex: Int): DataCol

groupBy

Creates a grouped data-frame given a list of grouping attributes.

abstract fun groupBy(vararg by: String): DataFrame

groupedBy

Returns a data-frame of distinct grouping variable tuples for a grouped data-frame. An empty data-frame for ungrouped data.

abstract fun groupedBy(): DataFrame

groups

Returns the groups of a grouped data frame or just a reference to this object if not.

abstract fun groups(): List<DataFrame>

remove

Remove selected columns.

open fun remove(vararg columns: String): DataFrame
open fun remove(columns: Iterable<String>): DataFrame
open fun remove(vararg columSelects: ColumnSelector): DataFrame

Remove selected columns using a predicate

open fun remove(columnSelect: ColumnSelector): DataFrame

removeIf

Select or remove columns by predicate.

open fun removeIf(colSelector: (DataCol) -> Boolean): DataFrame

row

Returns a row by index

abstract fun row(rowIndex: Int): DataFrameRow

select

Create a new data frame with only the selected columns.

abstract fun select(vararg columns: String): DataFrame
open fun select(vararg columns: ColumnSelector): DataFrame

Convenience wrapper around the vararg variant of krangl.DataFrame.select that accepts an Iterable of column names.

open fun select(columns: Iterable<String>): DataFrame

Keeps only the variables that match any of the given expressions.

open fun select(columnSelect: ColumnSelector): DataFrame

selectIf

Select or remove columns by predicate.

open fun selectIf(colSelector: (DataCol) -> Boolean): DataFrame

sortedBy

Re-sorts the receiver in ascending order (small values go to the top of the table). The first argument defines the primary attribute to sort by. Additional ones are used to resolve ties.

abstract fun sortedBy(vararg by: String): DataFrame

sortedByDescending

Re-sorts the receiver in descending order (small values go to the bottom of the table). The first argument defines the primary attribute to sort by. Additional ones are used to resolve ties.

open fun sortedByDescending(vararg by: String): DataFrame

summarize

Creates a summary of a table or a group. The provided expression is expected to evaluate to a scalar value and not into a column. summarize() is typically used on grouped data created by groupBy(). The output will have one row for each group.

abstract fun summarize(vararg sumRules: ColumnFormula): DataFrame

transmute

Create a new dataframe based on a list of column-formulas which are evaluated in the context of this instance.

open fun transmute(vararg formula: ColumnFormula): DataFrame

ungroup

Removes the grouping (if present) from a data frame.

abstract fun ungroup(): DataFrame

Extension Functions

addRowNumber

Add the row-number as column to a data-frame.

fun DataFrame.addRowNumber(name: String = "row_number"): DataFrame

asString

fun DataFrame.asString(title: String = "A DataFrame", colNames: Boolean = true, maxRows: Int = PRINT_MAX_ROWS, maxWidth: Int = PRINT_MAX_WIDTH, maxDigits: Int = PRINT_MAX_DIGITS, rowNumbers: Boolean = PRINT_ROW_NUMBERS): String

bindRows

Adds new rows. Missing entries are set to null. The output of bindRows() will contain a column if that column appears in any of the inputs.

fun DataFrame.bindRows(df: DataFrame): DataFrame
fun DataFrame.bindRows(vararg someRows: DataFrameRow): DataFrame

complete

Turns implicit missing values into explicit missing values. This is a wrapper around expand(), dplyr::left_join() and replace_na() that's useful for completing missing combinations of data.

fun DataFrame.complete(vararg columnNames: String): DataFrame

count

Counts observations by group.

fun DataFrame.count(vararg selects: String, name: String = "n"): DataFrame

countExpr

Counts expressions

fun DataFrame.countExpr(vararg moreExpressions: TableExpression, name: String = "n", tableExpression: TableExpression? = null): DataFrame

distinct

Retain only unique/distinct rows from an input tbl.

fun DataFrame.distinct(vararg selects: String = this.names.toTypedArray()): DataFrame

expand

expand() is often useful in conjunction with left_join if you want to convert implicit missing values to explicit missing values. Or you can use it in conjunction with anti_join() to figure out which combinations are missing.

fun DataFrame.expand(vararg columnNames: String): DataFrame

filter

Filter the rows of a table with a single predicate.

fun DataFrame.filter(predicate: ExpressionContext.(ExpressionContext) -> List<Boolean?>): DataFrame

AND-filter a table with different filters.

fun DataFrame.filter(vararg predicates: DataFrame.(DataFrame) -> List<Boolean>): DataFrame

flatten

Unfold all list columns vertically and all objects properties horizontally.

fun DataFrame.flatten(): DataFrame

gather

Gather takes multiple columns and collapses into key-value pairs, duplicating all other columns as needed. You use gather() when you notice that you have columns that are not variables.

fun DataFrame.gather(key: String, value: String, columns: List<String> = this.names, convert: Boolean = false): DataFrame
fun DataFrame.gather(key: String, value: String, columns: ColumnSelector, convert: Boolean = false): DataFrame

groupBy

Creates a grouped data-frame from a column selector function. See select() for details about column selection.

fun DataFrame.groupBy(columnSelect: ColumnSelector): DataFrame

groupByExpr

Creates a grouped data-frame from one or more table expressions. See addColumn() for details about table expressions.

fun DataFrame.groupByExpr(vararg moreExpressions: TableExpression, tableExpression: TableExpression? = null): DataFrame

head

fun DataFrame.head(numRows: Int = 5): DataFrame

innerJoin

fun DataFrame.innerJoin(right: DataFrame, by: String, suffices: Pair<String, String> = ".x" to ".y"): DataFrame
fun DataFrame.innerJoin(right: DataFrame, by: Iterable<String> = defaultBy(this, right), suffices: Pair<String, String> = ".x" to ".y"): DataFrame

leftJoin

Convenience wrapper around leftJoin that works with a single by attribute.

fun DataFrame.leftJoin(right: DataFrame, by: String, suffices: Pair<String, String> = ".x" to ".y"): DataFrame
fun DataFrame.leftJoin(right: DataFrame, by: Iterable<String> = defaultBy(this, right), suffices: Pair<String, String> = ".x" to ".y"): DataFrame

moveLeft

Push some columns to the left end of a data-frame.

fun DataFrame.moveLeft(vararg columnNames: String): DataFrame

moveRight

Push some columns to the right end of a data-frame.

fun DataFrame.moveRight(vararg columnNames: String): DataFrame

nest

Nest repeated values in a list-variable.

fun DataFrame.nest(colSelect: ColumnSelector = { except(*groupedBy().names.toTypedArray()) }, columnName: String = DEF_NEST_COLUMN_NAME): DataFrame

oneHot

Performs a one-hot encoding of the specified Any column.

fun <T> DataFrame.oneHot(columnName: String, naValue: String = "NA", categorizeWith: (T?) -> String? = { it?.toString() }): DataFrame

Performs a one-hot encoding of the specified column.

fun DataFrame.oneHot(columnName: String): DataFrame

outerJoin

fun DataFrame.outerJoin(right: DataFrame, by: Iterable<String> = defaultBy(this, right)): DataFrame

print

fun DataFrame.print(title: String = "A DataFrame", colNames: Boolean = true, maxRows: Int = PRINT_MAX_ROWS, maxWidth: Int = PRINT_MAX_WIDTH, maxDigits: Int = PRINT_MAX_DIGITS, rowNumbers: Boolean = PRINT_ROW_NUMBERS): Unit

printDataClassSchema

Provides code to convert a dataframe to a strongly typed list of Kotlin data-class instances.

fun DataFrame.printDataClassSchema(dataClassName: String, receiverVarName: String = "dataFrame"): Unit

remove

Remove columns by column type

fun <T : DataCol> DataFrame.remove(): DataFrame

rename

fun DataFrame.rename(vararg old2new: Pair<String, String>): DataFrame

Rename one or several columns. Positions should be preserved

fun DataFrame.rename(vararg old2new: RenameRule): DataFrame

rowsAs

Convert rows into objects by using reflection. Only parameters used in constructor will be mapped. Note: This is tested with kotlin data classes only. File a ticket for better type compatibility or any issues!

fun <T> DataFrame.rowsAs(mapping: Map<String, String> = names.map { it to it }.toMap()): Iterable<T>

rowwise

Creates a grouped data-frame where each group consists of exactly one line. Thereby the row-number is used as a group-hash.

fun DataFrame.rowwise(): DataFrame

sampleFrac

Select random rows from a table. If receiver is grouped, sampling is done per group.

fun DataFrame.sampleFrac(fraction: Double, replace: Boolean = false): DataFrame

sampleN

Select random rows from a table. If receiver is grouped, sampling is done per group.

fun DataFrame.sampleN(n: Int, replace: Boolean = false): DataFrame

schema

Prints the schema (that is column names, types, and the first few values per column) of a dataframe to stdout.

fun DataFrame.schema(maxDigits: Int = 3, maxWidth: Int = PRINT_MAX_WIDTH): Unit

select

Select columns by column type

fun <T : DataCol> DataFrame.select(): DataFrame

semiJoin

fun DataFrame.semiJoin(right: DataFrame, by: String): DataFrame
fun DataFrame.semiJoin(right: DataFrame, by: Iterable<Pair<String, String>>): DataFrame
fun DataFrame.semiJoin(right: DataFrame, by: Iterable<String> = defaultBy(this, right), suffices: Pair<String, String> = ".x" to ".y"): DataFrame

separate

Given either regular expression or a vector of character positions, separate() turns a single character column into multiple columns.

fun DataFrame.separate(column: String, into: List<String>, sep: String = "[^\\w]", remove: Boolean = true, convert: Boolean = false): DataFrame

setNames

Replace current column names with new ones. The number of provided names must match the number of columns.

fun DataFrame.setNames(vararg newNames: String): DataFrame

shuffle

Randomize the row order of a data-frame.

fun DataFrame.shuffle(): DataFrame

slice

Select rows by position while taking into account grouping in a data-frame.

fun DataFrame.slice(vararg slices: Int): DataFrame
fun DataFrame.slice(slice: IntRange): DataFrame

sortedBy

fun DataFrame.sortedBy(sortExpression: SortExpression): DataFrame
fun DataFrame.sortedBy(vararg sortExpressions: SortExpression): DataFrame

spread

Spread a key-value pair across multiple columns.

fun DataFrame.spread(key: String, value: String, fill: Any? = null, convert: Boolean = false): DataFrame

summarize

fun DataFrame.summarize(name: String, tableExpression: TableExpression): DataFrame

summarizeAt

fun DataFrame.summarizeAt(columnSelect: ColumnSelector, vararg aggfuns: AggFun): DataFrame
fun DataFrame.summarizeAt(columnSelect: ColumnSelector, op: (SummarizeBuilder.() -> Unit)? = null): DataFrame

tail

fun DataFrame.tail(numRows: Int = 5): DataFrame

take

fun DataFrame.take(numRows: Int = 5): DataFrame

takeLast

fun DataFrame.takeLast(numRows: Int): DataFrame

toDoubleMatrix

fun DataFrame.toDoubleMatrix(): Array<DoubleArray>

toFloatMatrix

fun DataFrame.toFloatMatrix(): Array<FloatArray>

toMap

Expose a view on the data as map from column names to nullable arrays.

fun DataFrame.toMap(): Map<String, Array<*>>

unfold

fun <T> DataFrame.unfold(columnName: String, properties: List<String> = detectPropertiesByReflection<T>().map { it.name }, keep: Boolean = true): DataFrame

unite

Convenience function to paste together multiple columns into one.

fun DataFrame.unite(colName: String, which: List<String>, sep: String = "_", remove: Boolean = true): DataFrame
fun DataFrame.unite(colName: String, vararg which: ColNames.() -> List<Boolean?>, sep: String = "_", remove: Boolean = true): DataFrame

unnest

If you have a list-column, this makes each element of the list its own row. It unfolds data vertically. unnest() can handle list-columns that contain atomic vectors, lists, or data frames (but not a mixture of the different types).

fun DataFrame.unnest(columnName: String): DataFrame

writeCSV

fun DataFrame.writeCSV(file: File, format: CSVFormat = CSVFormat.DEFAULT.withHeader(*names.toTypedArray())): Unit

writeExcel

fun DataFrame.writeExcel(filePath: String, sheetName: String, headers: Boolean = true, eraseFile: Boolean = false, boldHeaders: Boolean = true): Unit

writeTSV

fun DataFrame.writeTSV(file: File, format: CSVFormat = CSVFormat.TDF.withHeader(*names.toTypedArray())): Unit

Companion Object Extension Functions

builder

Create a new data frame in place.

fun DataFrame.Companion.builder(vararg header: String): InplaceDataFrameBuilder

fromJson

fun DataFrame.Companion.fromJson(file: File): DataFrame
fun DataFrame.Companion.fromJson(fileOrUrl: String): DataFrame
fun DataFrame.Companion.fromJson(url: URL): DataFrame

fromJsonString

fun DataFrame.Companion.fromJsonString(jsonData: String): DataFrame

fromRecords

Create a data-frame from a list of objects

fun <T> DataFrame.Companion.fromRecords(records: Iterable<T>, mapping: (T) -> DataFrameRow): DataFrame

fromResultSet

fun DataFrame.Companion.fromResultSet(rs: ResultSet): DataFrame

readCSV

fun DataFrame.Companion.readCSV(fileOrUrl: String, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), colTypes: Map<String, ColType> = mapOf()): DataFrame
fun DataFrame.Companion.readCSV(file: File, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), colTypes: Map<String, ColType> = mapOf()): DataFrame

readDelim

fun DataFrame.Companion.readDelim(uri: URI, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), isCompressed: Boolean = uri.toURL().toString().endsWith(".gz"), colTypes: Map<String, ColType> = mapOf()): DataFrame
fun DataFrame.Companion.readDelim(inStream: InputStream, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), isCompressed: Boolean = false, colTypes: Map<String, ColType> = mapOf()): DataFrame
fun DataFrame.Companion.readDelim(reader: Reader, format: CSVFormat = CSVFormat.DEFAULT.withHeader(), colTypes: Map<String, ColType> = mapOf(), skip: Int = 0): DataFrame

readExcel

Returns a DataFrame with the contents from an Excel file sheet

fun DataFrame.Companion.readExcel(path: String, sheet: Int = 0, cellRange: CellRangeAddress? = null, colTypes: Map<String, ColType> = mapOf(), trim_ws: Boolean = false, guessMax: Int = 100, na: String = MISSING_VALUE, stopAtBlankLine: Boolean = true, includeBlankLines: Boolean = false): DataFrame
fun DataFrame.Companion.readExcel(path: String, sheet: String, cellRange: CellRangeAddress? = null, colTypes: Map<String, ColType> = mapOf(), trim_ws: Boolean = false, guessMax: Int = 100, na: String = MISSING_VALUE, stopAtBlankLine: Boolean = true, includeBlankLines: Boolean = false): DataFrame

readTSV

fun DataFrame.Companion.readTSV(fileOrUrl: String, format: CSVFormat = CSVFormat.TDF.withHeader(), colTypes: Map<String, ColType> = mapOf()): DataFrame
fun DataFrame.Companion.readTSV(file: File, format: CSVFormat = CSVFormat.TDF.withHeader(), colTypes: Map<String, ColType> = mapOf()): DataFrame