Kotlin Dataframe in 100 Queries¶
- This is an attempt to create a tutorial for the Kotlin dataframe
- It is a clone of "SQL for Data Scientists in 100 Queries" by Greg Wilson, with each SQL query example re-written with the dataframe syntax/API
- edit: the authors have evolved the source document, changing the title, removing some of the examples, and eliminating the numbering -- a copy of the original document upon which this tutorial is based can be found here
- The original caught my eye when it was the top story on Hacker News (Feb 6, 2024) and seemed to be a very good basis for a SQL → dataframe tutorial
- Huge thank you to Greg and his collaborators for a great source document!
- This is a static HTML document, however this repo contains the Jupyter/Kotlin notebook file (.ipynb) if you'd like to run it yourself and edit the code.
Who wrote this?
Andy Goldberg
- I'm an independent full-stack developer/consultant/database marketer/financial analyst/game inventor
- My firm's name is Centerfield Nine - a reference to the casino gaming industry, the primary focus of my consulting work
- https://www.cfnine.com
- github: @2x2xplz
- X/twitter: @cfnine
initially published February 2024
- my motivation for creating this document is described in detail in this essay
- video of my KotlinConf talk (April 2023, Amsterdam) about dataframe is on Youtube
I am always open to project proposals - please reach out (also with comments about this site): andy [at] cfnine [dot] com
what is dataframe and when would you use it?
dataframe is a programmatic, in-memory representation of tabular data. It is Kotlin's parallel to Python's pandas or R's dplyr. Another way to think about it is Excel, but using a programming language rather than individual cell formulas. It is an easy-to-use interface for manipulating structured data, just as Excel can -- sorting, filtering, updating, calculating, and much more. While dataframe has its own specific API, almost any Kotlin function can be incorporated into data processing routines. Most dataframe projects start by reading data from a relational database, a CSV file, JSON data structure, or Excel sheet.
- dataframe documentation: https://kotlin.github.io/dataframe/overview.html
- github repository: https://github.com/Kotlin/dataframe
- other great tutorials in https://github.com/Kotlin/dataframe/tree/master/examples/notebooks (especially "40 puzzles")
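For example, getting data in is often a one-liner. A minimal sketch (penguins.csv is a hypothetical local file, not part of this tutorial's data):
// read a CSV from disk into an untyped DataFrame<*>
val dfFromCsv = DataFrame.readCSV("penguins.csv")
dfFromCsv.head() // quick preview of the first rows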
with all of that out of the way... let's go!
// %useLatestDescriptors
%use dataframe(0.12.1)
%use kandy(0.5.0)
@file:DependsOn("org.xerial:sqlite-jdbc:3.45.1.0") // SQLite database driver JDBC
@file:DependsOn("org.jetbrains.kotlinx:kotlinx-serialization-json:1.6.2") // JSON serialization
@file:DependsOn("org.ktorm:ktorm-core:3.6.0") // database query tool & ORM
@file:DependsOn("org.ktorm:ktorm-support-sqlite:3.6.0") // database query tool & ORM
@file:DependsOn("com.github.doyaaaaaken:kotlin-csv-jvm:1.9.3") // better CSV reader than DataFrame built-in
1: select constant¶
listOf<Long>(1).toDataFrame().single()
DataRow: index = 0, columnsCount = 1
value |
---|
1 |
2: reading table from database¶
import java.sql.Connection
import java.sql.DriverManager
import java.sql.ResultSet
import java.sql.SQLException
import java.sql.Statement
import org.jetbrains.kotlinx.dataframe.io.DatabaseConfiguration
val penguinsConn : Connection = DriverManager.getConnection("jdbc:sqlite:penguins.db")
DataFrame.readSqlTable(penguinsConn, "little_penguins")
DataFrame: rowsCount = 10, columnsCount = 7
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
---|---|---|---|---|---|---|
Adelie | Dream | 37.2 | 18.1 | 178 | 3900 | MALE |
Adelie | Dream | 37.6 | 19.3 | 181 | 3300 | FEMALE |
Gentoo | Biscoe | 50 | 15.3 | 220 | 5550 | MALE |
Adelie | Torgersen | 37.3 | 20.5 | 199 | 3775 | MALE |
Adelie | Biscoe | 39.6 | 17.7 | 186 | 3500 | FEMALE |
Gentoo | Biscoe | 47.7 | 15 | 216 | 4750 | FEMALE |
Adelie | Dream | 36.5 | 18 | 182 | 3150 | FEMALE |
Gentoo | Biscoe | 42 | 13.5 | 210 | 4150 | FEMALE |
Adelie | Torgersen | 42.1 | 19.1 | 195 | 4000 | MALE |
Gentoo | Biscoe | 54.3 | 15.7 | 231 | 5650 | MALE |
// we can run queries, rather than pulling an entire table
// when dealing with large tables, you'll almost always want to do some filtering or grouping in the database, rather than pulling the entire table
// you can also build views in your database to manipulate data,
// and you can also use external DB/ORM tools (ktorm, jOOQ, Exposed, etc) to build more complex queries and pull the data -- a small ktorm sketch follows the output below
// personal suggestion: use the built-in `readSql*` functions only for very basic queries
DataFrame.readSqlQuery(penguinsConn, "select species, island, sex from little_penguins")
DataFrame: rowsCount = 10, columnsCount = 3
species | island | sex |
---|---|---|
Adelie | Dream | MALE |
Adelie | Dream | FEMALE |
Gentoo | Biscoe | MALE |
Adelie | Torgersen | MALE |
Adelie | Biscoe | FEMALE |
Gentoo | Biscoe | FEMALE |
Adelie | Dream | FEMALE |
Gentoo | Biscoe | FEMALE |
Adelie | Torgersen | MALE |
Gentoo | Biscoe | MALE |
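As mentioned in the comments above, an external query/ORM tool (here ktorm, whose dependencies were loaded earlier) can build more complex SQL in the database and hand the resulting rows to dataframe. A minimal sketch -- the `LittlePenguinsTable` object and `PenguinLocation` class are illustrative names created for this example, not part of the source notebook:
import org.ktorm.database.Database
import org.ktorm.dsl.*
import org.ktorm.schema.Table
import org.ktorm.schema.varchar
// describe the table columns we care about, for ktorm's SQL DSL
object LittlePenguinsTable : Table<Nothing>("little_penguins") {
    val species = varchar("species")
    val island = varchar("island")
    val sex = varchar("sex")
}
data class PenguinLocation(val species: String?, val island: String?, val sex: String?)
val ktormDb = Database.connect("jdbc:sqlite:penguins.db")
val dfViaKtorm = ktormDb
    .from(LittlePenguinsTable)
    .select(LittlePenguinsTable.species, LittlePenguinsTable.island, LittlePenguinsTable.sex)
    .where { LittlePenguinsTable.island eq "Biscoe" } // filtering happens in the database
    .map { row -> PenguinLocation(row[LittlePenguinsTable.species], row[LittlePenguinsTable.island], row[LittlePenguinsTable.sex]) }
    .toDataFrame() // list of data class instances -> DataFrame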
2a: reading a table from a database and assigning it to a df variable¶
// in this initial pull, we don't know the schema and the result is untyped (DataFrame<*>)
val dfPenguinsNaive : DataFrame<*> = DataFrame.readSqlTable(penguinsConn, "little_penguins")
3: specifying columns with select¶
dfPenguinsNaive.select { cols("species","island","sex") }
DataFrame: rowsCount = 10, columnsCount = 3
species | island | sex |
---|---|---|
Adelie | Dream | MALE |
Adelie | Dream | FEMALE |
Gentoo | Biscoe | MALE |
Adelie | Torgersen | MALE |
Adelie | Biscoe | FEMALE |
Gentoo | Biscoe | FEMALE |
Adelie | Dream | FEMALE |
Gentoo | Biscoe | FEMALE |
Adelie | Torgersen | MALE |
Gentoo | Biscoe | MALE |
3a: defining a typed schema¶
dataframe API has some functions to generate a schema automatically from a database table:
https://kotlin.github.io/dataframe/readsqldatabases.html#schema-retrieval-for-specific-sql-table
getSchemaForSqlTable()
it also can generate during a Gradle build:
https://kotlin.github.io/dataframe/schemasimportsqlgradle.html
@file:ImportDataSchema
this also supports OpenAPI schemas from a json definition
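For completeness, a minimal sketch of the automatic schema retrieval mentioned above, using the `penguinsConn` connection defined earlier (see the linked docs for the exact overloads):
// ask dataframe to inspect the table and report the column names/types it infers
val littlePenguinsSchema = DataFrame.getSchemaForSqlTable(penguinsConn, "little_penguins")
littlePenguinsSchema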
// here, we define a data class manually and annotate with `@DataSchema` which allows DataFrame to use it as a typed schema
@DataSchema
data class Penguin ( // can also be an interface
val species : String,
val island : String,
val bill_length_mm : Double?,
val bill_depth_mm : Double?,
val flipper_length_mm : Int?,
val body_mass_g : Int?,
val sex : String?
)
val dfPenguins : DataFrame<Penguin> = DataFrame.readSqlTable(penguinsConn, "penguins") as DataFrame<Penguin>
dfPenguins.describe()
DataFrame: rowsCount = 7, columnsCount = 10
name | type | count | unique | nulls | top | freq | min | median | max |
---|---|---|---|---|---|---|---|---|---|
species | String? | 344 | 3 | 0 | Adelie | 152 | Adelie | Chinstrap | Gentoo |
island | String? | 344 | 3 | 0 | Biscoe | 168 | Biscoe | Dream | Torgersen |
bill_length_mm | String? | 344 | 165 | 2 | 41.1 | 7 | 32.1 | 44.4 | 59.6 |
bill_depth_mm | String? | 344 | 81 | 2 | 17 | 12 | 13.1 | 17.3 | 21.5 |
flipper_length_mm | String? | 344 | 56 | 2 | 190 | 22 | 172 | 197 | 231 |
body_mass_g | String? | 344 | 95 | 2 | 3800 | 12 | 2700 | 4050 | 6300 |
sex | String? | 344 | 3 | 11 | MALE | 168 | FEMALE | MALE | MALE |
// hey wait a second, this schema doesn't match what we defined! the source SQLite database defines every column as TEXT (no numeric columns)
// I would have expected `as DataFrame<Penguin>` to throw an error, but it's silent
// however, we can use `convertTo()` to get our schema corrected -- note the `type` column
val dfPenguins : DataFrame<Penguin> = DataFrame.readSqlTable(penguinsConn, "penguins").convertTo<Penguin>()
dfPenguins.describe()
DataFrame: rowsCount = 7, columnsCount = 12
name | type | count | unique | nulls | top | freq | mean | std | min | median | max |
---|---|---|---|---|---|---|---|---|---|---|---|
species | String | 344 | 3 | 0 | Adelie | 152 | null | null | Adelie | Chinstrap | Gentoo |
island | String | 344 | 3 | 0 | Biscoe | 168 | null | null | Biscoe | Dream | Torgersen |
bill_length_mm | Double? | 344 | 165 | 2 | 41.100000 | 7 | 43.921930 | 5.459584 | 32.100000 | 44.450000 | 59.600000 |
bill_depth_mm | Double? | 344 | 81 | 2 | 17.000000 | 12 | 17.151170 | 1.974793 | 13.100000 | 17.300000 | 21.500000 |
flipper_length_mm | Int? | 344 | 56 | 2 | 190 | 22 | 200.915205 | 14.061714 | 172 | 197 | 231 |
body_mass_g | Int? | 344 | 95 | 2 | 3800 | 12 | 4201.754386 | 801.954536 | 2700 | 4050 | 6300 |
sex | String? | 344 | 3 | 11 | MALE | 168 | null | null | FEMALE | MALE | MALE |
4: sort with sortBy¶
val dfLittlePenguins : DataFrame<Penguin> = DataFrame.readSqlTable(penguinsConn, "little_penguins") as DataFrame<Penguin>
dfLittlePenguins
.select { species and sex and island }
.sortBy { island and sex.desc() }
DataFrame: rowsCount = 10, columnsCount = 3
species | sex | island |
---|---|---|
Gentoo | MALE | Biscoe |
Gentoo | MALE | Biscoe |
Adelie | FEMALE | Biscoe |
Gentoo | FEMALE | Biscoe |
Gentoo | FEMALE | Biscoe |
Adelie | MALE | Dream |
Adelie | FEMALE | Dream |
Adelie | FEMALE | Dream |
Adelie | MALE | Torgersen |
Adelie | MALE | Torgersen |
5: limit output with take¶
dfPenguins
.select { species and sex and island }
.sortBy { species and sex and island }
.take(10)
DataFrame: rowsCount = 10, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | null | Dream |
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
6: page output with drop and take¶
dfPenguins
.select { species and sex and island }
.sortBy { species and sex and island }
.drop(3)
.take(10)
// `take` is the equivalent of SQL limit -- the result set will not exceed the designated number of rows
// head() will also present a preview of a designated number of rows
DataFrame: rowsCount = 10, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
Adelie | FEMALE | Biscoe |
7: remove duplicates with distinct¶
dfPenguins
.distinct { species and sex and island }
DataFrame: rowsCount = 13, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | MALE | Torgersen |
Adelie | FEMALE | Torgersen |
Adelie | null | Torgersen |
Adelie | FEMALE | Biscoe |
Adelie | MALE | Biscoe |
Adelie | FEMALE | Dream |
Adelie | MALE | Dream |
Adelie | null | Dream |
Chinstrap | FEMALE | Dream |
Chinstrap | MALE | Dream |
Gentoo | FEMALE | Biscoe |
Gentoo | MALE | Biscoe |
Gentoo | null | Biscoe |
8. filter results with filter¶
dfPenguins
.distinct { species and sex and island }
.filter { island == "Biscoe" }
DataFrame: rowsCount = 5, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | FEMALE | Biscoe |
Adelie | MALE | Biscoe |
Gentoo | FEMALE | Biscoe |
Gentoo | MALE | Biscoe |
Gentoo | null | Biscoe |
9. more complex filters¶
dfPenguins
.distinct { species and sex and island }
.filter { island == "Biscoe" && sex != "MALE" } // note: this includes sex=NULL, SQL does not
DataFrame: rowsCount = 3, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | FEMALE | Biscoe |
Gentoo | FEMALE | Biscoe |
Gentoo | null | Biscoe |
// the given example is not very complex, here's another
dfPenguins
.filter { (species == "Adelie" && (bill_depth_mm ?: 0.0) >= 21.0) ||
(island == "Biscoe" && sex == "FEMALE" && ((body_mass_g ?: 0) > 5000 || (bill_length_mm ?: 0.0) > 48.0)) }
DataFrame: rowsCount = 17, columnsCount = 7
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
---|---|---|---|---|---|---|
Adelie | Torgersen | 38.600000 | 21.200000 | 191 | 3800 | MALE |
Adelie | Torgersen | 34.600000 | 21.100000 | 198 | 4400 | MALE |
Adelie | Torgersen | 46.000000 | 21.500000 | 194 | 4200 | MALE |
Adelie | Dream | 39.200000 | 21.100000 | 196 | 4150 | MALE |
Adelie | Dream | 42.300000 | 21.200000 | 191 | 4150 | MALE |
Adelie | Biscoe | 41.300000 | 21.100000 | 195 | 4400 | MALE |
Gentoo | Biscoe | 48.700000 | 14.100000 | 210 | 4450 | FEMALE |
Gentoo | Biscoe | 48.200000 | 14.300000 | 210 | 4600 | FEMALE |
Gentoo | Biscoe | 45.100000 | 14.500000 | 207 | 5050 | FEMALE |
Gentoo | Biscoe | 49.100000 | 14.800000 | 220 | 5150 | FEMALE |
Gentoo | Biscoe | 44.900000 | 13.300000 | 213 | 5100 | FEMALE |
Gentoo | Biscoe | 46.500000 | 14.800000 | 217 | 5200 | FEMALE |
Gentoo | Biscoe | 49.100000 | 14.500000 | 212 | 4625 | FEMALE |
Gentoo | Biscoe | 48.400000 | 14.400000 | 203 | 4625 | FEMALE |
Gentoo | Biscoe | 48.500000 | 15.000000 | 219 | 4850 | FEMALE |
Gentoo | Biscoe | 50.500000 | 15.200000 | 216 | 5000 | FEMALE |
Gentoo | Biscoe | 45.200000 | 14.800000 | 212 | 5200 | FEMALE |
// we can incorporate any Kotlin features/functions
// this would likely be represented in SQL with a series of CASE...WHEN...THEN statements
dfPenguins
.filter { when(species) {
"Adelie" -> (body_mass_g ?: 0) > 4500
"Chinstrap" -> (body_mass_g ?: 0) > 4600
"Gentoo" -> (body_mass_g ?: 0) > 5750
else -> true
}
}
DataFrame: rowsCount = 19, columnsCount = 7
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
---|---|---|---|---|---|---|
Adelie | Torgersen | 39.200000 | 19.600000 | 195 | 4675 | MALE |
Adelie | Dream | 39.800000 | 19.100000 | 184 | 4650 | MALE |
Adelie | Dream | 39.600000 | 18.800000 | 190 | 4600 | MALE |
Adelie | Torgersen | 42.900000 | 17.600000 | 196 | 4700 | MALE |
Adelie | Biscoe | 41.000000 | 20.000000 | 203 | 4725 | MALE |
Adelie | Biscoe | 43.200000 | 19.000000 | 197 | 4775 | MALE |
Adelie | Biscoe | 45.600000 | 20.300000 | 191 | 4600 | MALE |
Chinstrap | Dream | 52.000000 | 20.700000 | 210 | 4800 | MALE |
Gentoo | Biscoe | 48.400000 | 14.600000 | 213 | 5850 | MALE |
Gentoo | Biscoe | 49.300000 | 15.700000 | 217 | 5850 | MALE |
Gentoo | Biscoe | 49.200000 | 15.200000 | 221 | 6300 | MALE |
Gentoo | Biscoe | 59.600000 | 17.000000 | 230 | 6050 | MALE |
Gentoo | Biscoe | 49.500000 | 16.200000 | 229 | 5800 | MALE |
Gentoo | Biscoe | 48.600000 | 16.000000 | 230 | 5800 | MALE |
Gentoo | Biscoe | 51.100000 | 16.300000 | 220 | 6000 | MALE |
Gentoo | Biscoe | 45.200000 | 16.400000 | 223 | 5950 | MALE |
Gentoo | Biscoe | 49.800000 | 15.900000 | 229 | 5950 | MALE |
Gentoo | Biscoe | 55.100000 | 16.000000 | 230 | 5850 | MALE |
Gentoo | Biscoe | 48.800000 | 16.200000 | 222 | 6000 | MALE |
10. perform some math calculations¶
// instead of dealing with NULLs inline, we could
// filter out all rows with null values, or use fillNA() to substitute zeroes for nulls
dfPenguins
.add("flipper_length_cm") { flipper_length_mm?.div(10.0) } // `add` is to add a column (not adding numbers)
.add("body_mass_kg") { body_mass_g?.div(1000.0) }
.select { "flipper_length_cm" and "body_mass_kg" }
.take(3)
DataFrame: rowsCount = 3, columnsCount = 2
flipper_length_cm | body_mass_kg |
---|---|
18.100000 | 3.750000 |
18.600000 | 3.800000 |
19.500000 | 3.250000 |
11. rename columns¶
// dataframe requires all new columns to have a name (see above)
// alternatively, we can define the column type ahead of time, then work with typed column objects
// strings are fine when accessing new columns once, but defined columns are preferred when referencing them often
val flipperLengthCM by column<Double?>()
val bodyMassKG by column<Double?>()
val whereFound by column<String>()
dfPenguins
.add(flipperLengthCM) { flipper_length_mm?.div(10.0) }
.add(bodyMassKG) { body_mass_g?.div(1000.0) }
.rename { island }.into(whereFound)
.select { flipperLengthCM and bodyMassKG and whereFound }
.dropNulls { bodyMassKG }
.filter { bodyMassKG()!! < 4.0 } // note the `()` -- short for `invoke()`
// the `invoke()` is usually needed in filters and when looking at the value of a column,
// not needed in select, rename, move, etc, when operating on the DataColumn itself
... showing only top 20 of 165 rows
DataFrame: rowsCount = 165, columnsCount = 3
flipperLengthCM | bodyMassKG | whereFound |
---|---|---|
18.100000 | 3.750000 | Torgersen |
18.600000 | 3.800000 | Torgersen |
19.500000 | 3.250000 | Torgersen |
19.300000 | 3.450000 | Torgersen |
19.000000 | 3.650000 | Torgersen |
18.100000 | 3.625000 | Torgersen |
19.300000 | 3.475000 | Torgersen |
18.600000 | 3.300000 | Torgersen |
18.000000 | 3.700000 | Torgersen |
18.200000 | 3.200000 | Torgersen |
19.100000 | 3.800000 | Torgersen |
18.500000 | 3.700000 | Torgersen |
19.500000 | 3.450000 | Torgersen |
18.400000 | 3.325000 | Torgersen |
17.400000 | 3.400000 | Biscoe |
18.000000 | 3.600000 | Biscoe |
18.900000 | 3.800000 | Biscoe |
18.500000 | 3.950000 | Biscoe |
18.000000 | 3.800000 | Biscoe |
18.700000 | 3.800000 | Biscoe |
12. calculate with missing values¶
// since Kotlin enforces nullability, there is no way to NOT deal with nulls -- this is identical to #11 above, without the filter
// the columns were defined in the above cell
dfPenguins
.add(flipperLengthCM) { flipper_length_mm?.div(10.0) }
.add(bodyMassKG) { body_mass_g?.div(1000.0) }
.rename { island }.into(whereFound)
.select { flipperLengthCM and bodyMassKG and whereFound }
.take(5)
DataFrame: rowsCount = 5, columnsCount = 3
flipperLengthCM | bodyMassKG | whereFound |
---|---|---|
18.100000 | 3.750000 | Torgersen |
18.600000 | 3.800000 | Torgersen |
19.500000 | 3.250000 | Torgersen |
null | null | Torgersen |
19.300000 | 3.450000 | Torgersen |
// one option is to use fillNulls() or fillNA() to replace NULL with 0
// BE CAREFUL! null is not zero, and you may get results that shouldn't be part of the data set
dfPenguins
.fillNA { flipper_length_mm and body_mass_g }.with { 0 }
.add(flipperLengthCM) { flipper_length_mm?.toDouble()?.div(10.0) }
.add(bodyMassKG) { body_mass_g?.toDouble()?.div(1000.0) }
.rename { island }.into(whereFound)
.select { flipperLengthCM and bodyMassKG and whereFound }
.filter { bodyMassKG()!! < 4.0 } // this will not filter the rows which were originally NULL, because they are now 0.0
//.take(5)
... showing only top 20 of 167 rows
DataFrame: rowsCount = 167, columnsCount = 3
flipperLengthCM | bodyMassKG | whereFound |
---|---|---|
18.100000 | 3.750000 | Torgersen |
18.600000 | 3.800000 | Torgersen |
19.500000 | 3.250000 | Torgersen |
0.000000 | 0.000000 | Torgersen |
19.300000 | 3.450000 | Torgersen |
19.000000 | 3.650000 | Torgersen |
18.100000 | 3.625000 | Torgersen |
19.300000 | 3.475000 | Torgersen |
18.600000 | 3.300000 | Torgersen |
18.000000 | 3.700000 | Torgersen |
18.200000 | 3.200000 | Torgersen |
19.100000 | 3.800000 | Torgersen |
18.500000 | 3.700000 | Torgersen |
19.500000 | 3.450000 | Torgersen |
18.400000 | 3.325000 | Torgersen |
17.400000 | 3.400000 | Biscoe |
18.000000 | 3.600000 | Biscoe |
18.900000 | 3.800000 | Biscoe |
18.500000 | 3.950000 | Biscoe |
18.000000 | 3.800000 | Biscoe |
13. null equality¶
dfPenguins
.distinct { species and sex and island }
.filter { island == "Biscoe" }
DataFrame: rowsCount = 5, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | FEMALE | Biscoe |
Adelie | MALE | Biscoe |
Gentoo | FEMALE | Biscoe |
Gentoo | MALE | Biscoe |
Gentoo | null | Biscoe |
dfPenguins
.distinct { species and sex and island }
.filter { island == "Biscoe" && sex == "FEMALE" }
DataFrame: rowsCount = 2, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | FEMALE | Biscoe |
Gentoo | FEMALE | Biscoe |
14. null inequality (SQL != Kotlin)¶
// be careful, null equality works differently in Kotlin than in SQL
dfPenguins
.distinct { species and sex and island }
.filter { island == "Biscoe" && sex != "FEMALE" } // the SQL example does NOT include the null sex record
DataFrame: rowsCount = 3, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | MALE | Biscoe |
Gentoo | MALE | Biscoe |
Gentoo | null | Biscoe |
15. ternary logic (SQL != Kotlin)¶
null == null // this is null, not true, in SQL ... anything compared to null returns null
true
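The same distinction drives the filters in #14 above and #16 below -- a tiny sketch of how a nullable comparison behaves in Kotlin versus SQL:
// Kotlin Boolean logic is two-valued: comparisons involving null still produce true or false
val s: String? = null
s == "FEMALE" // false in Kotlin; in SQL, sex = 'FEMALE' evaluates to NULL (unknown) when sex is NULL
s != "FEMALE" // true in Kotlin; in SQL this is also NULL, so the row would be excluded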
16. null conditions¶
dfPenguins
.select { species and sex and island }
.filter { sex == null }
DataFrame: rowsCount = 11, columnsCount = 3
species | sex | island |
---|---|---|
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | null | Torgersen |
Adelie | null | Dream |
Gentoo | null | Biscoe |
Gentoo | null | Biscoe |
Gentoo | null | Biscoe |
Gentoo | null | Biscoe |
Gentoo | null | Biscoe |
17. aggregations¶
dfPenguins.sumFor { body_mass_g }
DataRow: index = 0, columnsCount = 1
body_mass_g |
---|
1437000 |
18. common aggregation functions¶
dfPenguins.aggregate {
max { bill_length_mm } into "longestBill"
min { flipper_length_mm } into "shortestFlipper"
(mean { bill_length_mm } / mean { bill_depth_mm }) into "weirdRatio"
}
DataRow: index = 0, columnsCount = 3
longestBill | shortestFlipper | weirdRatio |
---|---|---|
59.600000 | 172 | 2.560871 |
// if you just want the aggregated values, you may prefer a different collection type, rather than a dataframe
mapOf<String, Double> (
"longestBill" to dfPenguins.max { bill_length_mm },
"shortestFlipper" to dfPenguins.min { flipper_length_mm }.toDouble(),
"weirdRatio" to dfPenguins.mean { bill_length_mm } / dfPenguins.mean { bill_depth_mm }
)
{longestBill=59.6, shortestFlipper=172.0, weirdRatio=2.5608708253064427}
19. counting¶
dfPenguins.aggregate {
count() into "countStar"
count { sex() != null } into "countSpecific"
valueCounts { sex }.count() into "distinctCount"
}
DataRow: index = 0, columnsCount = 3
countStar | countSpecific | distinctCount |
---|---|---|
344 | 333 | 2 |
// as a Map
mapOf<String, Int> (
"countStar" to dfPenguins.count(),
"countSpecific" to dfPenguins.count { sex != null },
"distinctCount" to dfPenguins.valueCounts { sex }.count()
)
{countStar=344, countSpecific=333, distinctCount=2}
// valueCounts can auto-group then count
dfPenguins.valueCounts { species and sex }
/*
select species, sex, count(*)
from penguins
group by species, sex
*/
DataFrame: rowsCount = 6, columnsCount = 3
species | sex | count |
---|---|---|
Adelie | MALE | 73 |
Adelie | FEMALE | 73 |
Gentoo | MALE | 61 |
Gentoo | FEMALE | 58 |
Chinstrap | FEMALE | 34 |
Chinstrap | MALE | 34 |
20. group, 21. behavior of unaggregated columns¶
// dataframe automatically returns the grouped column (SQL does not, unless specified)
dfPenguins
.groupBy { sex }
.mean { body_mass_g }
DataFrame: rowsCount = 3, columnsCount = 2
sex | body_mass_g |
---|---|
MALE | 4545.684524 |
FEMALE | 3862.272727 |
null | 4005.555556 |
22. arbitrary choice in aggregation¶
// this is a mistake, don't do it
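For reference, a sketch of what that mistake would look like in dataframe -- arbitrarily picking one row's value per group (don't do this either):
// DON'T do this: first() grabs whichever row happens to come first in each group -- an arbitrary, meaningless choice
dfPenguins
    .groupBy { sex }
    .aggregate {
        first().body_mass_g into "someBodyMass" // prefer a real aggregate (mean, min, max, ...)
    }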
23. filter aggregated values¶
// SQL has different keywords -- WHERE and HAVING -- for filtering before, or after, the aggregation
// Kotlin dataframe just has filter() for both, which can be confusing
// dataframe processes statements in order, so a filter() before grouping is like WHERE
// and a filter() after grouping is like HAVING
// note: SQL does NOT process things top-to-bottom -- more like FROM (including JOINs), WHERE, GROUP BY, SELECT, HAVING, ORDER BY
dfPenguins
.groupBy { sex }
.mean { body_mass_g }
.filter { "body_mass_g"<Double>() > 4000.0 } // referring to a new, aggregated column, so we must refer to it by string
// important to recognize that after the grouping, even though the mean column is titled "body_mass_g" we cannot refer to it directly
// body_mass_g still refers to the underlying individual data points
// however referring to the new column by its string title will work
DataFrame: rowsCount = 2, columnsCount = 2
sex | body_mass_g |
---|---|
MALE | 4545.684524 |
null | 4005.555556 |
24. readable output¶
// we can define the aggregated column ahead of time, ensuring the new column is renamed (and not deal with string column names)
// note that the order of the output is arbitrary unless specified with sortBy() (ORDER BY in SQL)
val avgBodyMass by column<Double>()
dfPenguins
.groupBy { sex }
.aggregate {
mean { body_mass_g } into (avgBodyMass)
}
.filter { avgBodyMass > 4000.0 }
DataFrame: rowsCount = 2, columnsCount = 2
sex | avgBodyMass |
---|---|
MALE | 4545.684524 |
null | 4005.555556 |
25. filter aggregate inputs¶
// the "100 SQL Queries" source uses a SQLite-only syntax, `filter` keyword within in the SELECT statement
val avgBodyMass by column<Double>()
dfPenguins
.dropNulls { body_mass_g }
.filter { body_mass_g!! < 4000.0 }
.groupBy { sex }
.aggregate {
mean { body_mass_g } into (avgBodyMass)
}
DataFrame: rowsCount = 3, columnsCount = 2
sex | avgBodyMass |
---|---|
MALE | 3729.629630 |
FEMALE | 3417.289720 |
null | 3362.500000 |
// wait a second, our result includes MALE = 3729.6, the source has 3752.5 -- why the difference?
// remember when we noted that the source data had all its data as TEXT, not numeric fields?
// when comparing text, "4000" < "4000.0"
// therefore, the source result set is actually including all the rows where body_mass_g is exactly 4000
dfPenguins
.dropNulls { body_mass_g }
.filter { body_mass_g!! <= 4000 } // gte result = 3752.54 (59 records), gt only result = 3729.62 (54 records)
.filter { sex == "MALE" }
.describe()
// when we use `body_mass_g!! <= 4000` the mean for body_mass_g matches the source result
// when we use `body_mass_g!! < 4000` the mean for body_mass_g matches our result
DataFrame: rowsCount = 7, columnsCount = 12
name | type | count | unique | nulls | top | freq | mean | std | min | median | max |
---|---|---|---|---|---|---|---|---|---|---|---|
species | String | 59 | 2 | 0 | Adelie | 39 | null | null | Adelie | Adelie | Chinstrap |
island | String | 59 | 3 | 0 | Dream | 35 | null | null | Biscoe | Dream | Torgersen |
bill_length_mm | Double | 59 | 46 | 0 | 40.600000 | 4 | 43.579661 | 5.604271 | 36.300000 | 41.100000 | 55.800000 |
bill_depth_mm | Double | 59 | 25 | 0 | 18.700000 | 4 | 18.969492 | 0.875383 | 17.000000 | 18.900000 | 21.200000 |
flipper_length_mm | Int | 59 | 25 | 0 | 190 | 7 | 192.796610 | 7.441217 | 178 | 193 | 210 |
body_mass_g | Int | 59 | 21 | 0 | 3950 | 8 | 3752.542373 | 198.740371 | 3250 | 3800 | 4000 |
sex | String | 59 | 1 | 0 | MALE | 59 | null | null | MALE | MALE | MALE |
// since the source rounds the result to 1 decimal place, we'll do that here
// rounding to an arbitrary number of digits is not natively supported in Kotlin; however, we can write an extension function
fun Double.round(decimals: Int): Double {
var multiplier = 1.0
repeat(decimals) { multiplier *= 10 }
return round(this * multiplier) / multiplier
}
dfPenguins
.dropNulls { body_mass_g }
.filter { body_mass_g!! < 4000.0 }
.groupBy { sex }
.aggregate {
mean { body_mass_g } into (avgBodyMass)
}
.update(avgBodyMass).with { it.round(1) }
DataFrame: rowsCount = 3, columnsCount = 2
sex | avgBodyMass |
---|---|
MALE | 3729.600000 |
FEMALE | 3417.300000 |
null | 3362.500000 |
26. create tables / build a dataframe, 27. insert data¶
// build df directly from data
val dfJob = dataFrameOf("name", "billable") (
"calibrate", 1.5,
"clean", 0.5
)
dfJob
DataFrame: rowsCount = 2, columnsCount = 2
name | billable |
---|---|
calibrate | 1.500000 |
clean | 0.500000 |
// create columns, then assemble into df
val person by columnOf<String>("mik","mik","mik","po","po","tay")
val job by columnOf<String>("calibrate","clean","complain","clean","complain","complain")
val dfWork = listOf(person, job).toDataFrame()
dfWork
DataFrame: rowsCount = 6, columnsCount = 2
person | job |
---|---|
mik | calibrate |
mik | clean |
mik | complain |
po | clean |
po | complain |
tay | complain |
28. update rows with, yes, update¶
dfWork
.update { person }
.where{ it == "tay" }
.with { "tae" }
DataFrame: rowsCount = 6, columnsCount = 2
person | job |
---|---|
mik | calibrate |
mik | clean |
mik | complain |
po | clean |
po | complain |
tae | complain |
// note one important difference between SQL and Kotlin dataframes
// SQL is an interface to a persistent data store -- every INSERT/UPDATE/DELETE/DROP/CREATE/etc operation makes persistent changes
// in the source SQL, after performing the UPDATE query changing "tay" to "tae", the new name is persistent
// there is nobody named "tay" anymore
// with dataframe, we are dealing with immutable objects -- we can perform some manipulation in a chained statement,
// but unless we assign the output to a new variable, those changes do not persist at all
dfWork // does not reflect the changes we just made above
DataFrame: rowsCount = 6, columnsCount = 2
person | job |
---|---|
mik | calibrate |
mik | clean |
mik | complain |
po | clean |
po | complain |
tay | complain |
// only if we assign a new variable, will the changes persist
val dfWorkUpdated : DataFrame<*> = dfWork.update { person }.where{ it == "tay" }.with { "tae" }
dfWorkUpdated
DataFrame: rowsCount = 6, columnsCount = 2
person | job |
---|---|
mik | calibrate |
mik | clean |
mik | complain |
po | clean |
po | complain |
tae | complain |
29. delete rows with drop¶
// note we are dropping the original "tay" (not "tae") because the `update` above did not persist (and this `drop` will not persist either)
dfWork.drop { person == "tay" }
DataFrame: rowsCount = 5, columnsCount = 2
person | job |
---|---|
mik | calibrate |
mik | clean |
mik | complain |
po | clean |
po | complain |
30. backing up¶
dataframe can read from a database, however it cannot write back to it
it can output to CSV, JSON, Excel or Apache Arrow: see https://kotlin.github.io/dataframe/write.html
it can also output to a standard Kotlin list or a map
you could use var rather than val and re-assign variables based on the output of a function (as pandas and R allow)
but this can be dangerous, especially while iterating during development
// these new variables are not truly "persistent" (not saved on disk anywhere) but can be referenced
val dfWorkTayBackup = dfWork.filter { person == "tay" }
val dfWorkExcludeTay = dfWork.drop { person == "tay" }
dfWorkTayBackup
DataFrame: rowsCount = 1, columnsCount = 2
person | job |
---|---|
tay | complain |
dfWorkExcludeTay
DataFrame: rowsCount = 5, columnsCount = 2
person | job |
---|---|
mik | calibrate |
mik | clean |
mik | complain |
po | clean |
po | complain |
dfWorkExcludeTay.writeCSV("./dataframeWorkExample") // save to disk as CSV
DataFrame.read("./dataframeWorkExample") // read from CSV
DataFrame: rowsCount = 5, columnsCount = 2
person | job |
---|---|
mik | calibrate |
mik | clean |
mik | complain |
po | clean |
po | complain |
val dfWorkSchema = listOf(person, job).toDataFrame().take(0) // copy the schema but not the data
dfWorkSchema
DataFrame: rowsCount = 0, columnsCount = 2
person | job |
---|
dfWork.xs("tay") { person }
DataFrame: rowsCount = 1, columnsCount = 1
job |
---|
complain |
// note: this cell fails -- dfWorkBackup was never defined above
dfWorkBackup = dfWorkBackup.concat(
    dfWork.filter { person == "tay" }
)
dfWorkBackup
Line_78.jupyter.kts (1:1 - 13) Unresolved reference: dfWorkBackup
Line_78.jupyter.kts (1:16 - 28) Unresolved reference: dfWorkBackup
Line_78.jupyter.kts (4:1 - 13) Unresolved reference: dfWorkBackup
// using a `var`, this would allow you to replace the dataframe with a modified one after an operation
var dfWorkVar = dfWork.filter { true }
dfWorkVar = dfWorkVar.update(person).with { it.uppercase() }.sortBy { job and person } // re-assigning the variable
dfWorkVar
DataFrame: rowsCount = 6, columnsCount = 2
person | job |
---|---|
MIK | calibrate |
MIK | clean |
PO | clean |
MIK | complain |
PO | complain |
TAY | complain |
Side note
Just about everything up to this point consists of very basic database operations. Filtering, sorting, aggregating, grouping, and updating will almost always be faster directly in the database. It is rarely optimal to transfer an entire table over the wire when the operation is going to reduce the number of rows significantly. In these cases, executing basic SQL queries is often preferred over using a dataframe. Of course, anytime you execute SQL you need to ensure you sanitize it (to avoid SQL injection, even accidentally), but this is relatively easy using almost any query tool or ORM.
Another consideration is whether you need a full dataframe, as opposed to existing Kotlin collections. Collections like Lists can easily filter, sort, calculate aggregates, etc. Dataframe gets more compelling as the complexity increases.
Joins (the next group of queries) are a little more difficult to write in SQL, but again, are usually faster directly in the database -- especially inner joins, which restrict the number of rows returned. Besides, the conceptual set-theory design -- which join type, which columns, whether to filter before or after the join, etc. -- is the same in SQL or dataframe. However, sometimes you need to join datasets that do not reside in the same database. That's a great use case for dataframe!
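As a small illustration of the collections point above, simple filter/aggregate work fits comfortably in a plain Kotlin List -- a sketch using `toListOf()` and the `Penguin` class defined earlier:
// convert the dataframe into an ordinary List<Penguin>, then use standard collection operations
val penguinList: List<Penguin> = dfPenguins.toListOf<Penguin>()
val avgBiscoeMass: Double = penguinList
    .filter { it.island == "Biscoe" }
    .mapNotNull { it.body_mass_g }
    .average()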
31. join tables¶
dfWork.fullJoin(dfJob)
DataFrame: rowsCount = 12, columnsCount = 4
person | job | name | billable |
---|---|---|---|
mik | calibrate | calibrate | 1.500000 |
mik | calibrate | clean | 0.500000 |
mik | clean | calibrate | 1.500000 |
mik | clean | clean | 0.500000 |
mik | complain | calibrate | 1.500000 |
mik | complain | clean | 0.500000 |
po | clean | calibrate | 1.500000 |
po | clean | clean | 0.500000 |
po | complain | calibrate | 1.500000 |
po | complain | clean | 0.500000 |
tay | complain | calibrate | 1.500000 |
tay | complain | clean | 0.500000 |
32. inner join¶
// simple syntax
dfWork.innerJoin(dfJob) { job match dfJob.name }
// note that the dataframe result only returns one of the matched columns (job, but not name)
// because the join condition ensures they are identical
DataFrame: rowsCount = 3, columnsCount = 3
person | job | billable |
---|---|---|
mik | calibrate | 1.500000 |
mik | clean | 0.500000 |
po | clean | 0.500000 |
// alternative syntax, this allows for more complex join conditions
// `it` refers to initial df and `right` refers to the joined df
dfWork.innerJoinWith(dfJob) { it.job == right.name }
DataFrame: rowsCount = 3, columnsCount = 4
person | job | name | billable |
---|---|---|---|
mik | calibrate | calibrate | 1.500000 |
mik | clean | clean | 0.500000 |
po | clean | clean | 0.500000 |
33. aggregate joined data¶
dfWork
.innerJoin(dfJob) { job match dfJob.name }
.groupBy { person }
.aggregate {
sum { "billable"<Double>() } into ("pay")
}
DataFrame: rowsCount = 2, columnsCount = 2
person | pay |
---|---|
mik | 2.000000 |
po | 0.500000 |
34. left join¶
both SQL and dataframe support multiple JOIN types
there are many tutorials online explaining the different types, such as:
https://www.atlassian.com/data/sql/sql-join-types-explained-visually
dfWork
.leftJoin(dfJob) { job match dfJob.name }
DataFrame: rowsCount = 6, columnsCount = 3
person | job | billable |
---|---|---|
mik | calibrate | 1.500000 |
mik | clean | 0.500000 |
mik | complain | null |
po | clean | 0.500000 |
po | complain | null |
tay | complain | null |
35. aggregate left joins¶
dfWork
.leftJoin(dfJob) { job match dfJob.name }
.groupBy { person }
.aggregate {
sum("billable") into ("pay")
mean("billable") into ("pay_avg")
}
// note that in this case, dataframe auto-coalesces null values to 0 in the SUM aggregation, but not the MEAN
// you will probably run into issues unless you specifically decide what to do with null values (filter them out, assign a value, etc)
DataFrame: rowsCount = 3, columnsCount = 3
person | pay | pay_avg |
---|---|---|
mik | 2.000000 | 1.000000 |
po | 0.500000 | 0.500000 |
tay | 0.000000 | NaN |
36. coalesce values¶
// here, we at least specify the default value in case of a null
dfWork
.leftJoin(dfJob) { job match dfJob.name }
.groupBy { person }
.aggregate {
sumOf { "billable"<Double?>() ?: 0.80 } into ("pay")
meanOf { "billable"<Double?>() ?: 0.80 } into ("pay_avg")
}
DataFrame: rowsCount = 3, columnsCount = 3
person | pay | pay_avg |
---|---|---|
mik | 2.800000 | 0.933333 |
po | 1.300000 | 0.650000 |
tay | 0.800000 | 0.800000 |
37. negate incorrectly¶
// in SQL, all columns, including un-`select`-ed ones, are available for joins, where, etc
// dataframe processes instructions sequentially, so if you select columns initially, the excluded ones aren't available for `filter`
/*
error:
dfWork
.distinct { person }
.filter { it.job != "calibrate" }
*/
// as the source says, this is not the result we wanted, because there is a row where 'mik' does 'calibrate'
dfWork
.filter { job != "calibrate" }
.distinct { person }
DataFrame: rowsCount = 3, columnsCount = 1
person |
---|
mik |
po |
tay |
38. set membership¶
dfWork
.filter { ! listOf("mik", "tay").contains(person) }
DataFrame: rowsCount = 2, columnsCount = 2
person | job |
---|---|
po | clean |
po | complain |
// using `in` infix
dfWork
.filter { person !in listOf("mik", "tay") }
DataFrame: rowsCount = 2, columnsCount = 2
person | job |
---|---|
po | clean |
po | complain |
39. subqueries¶
// similar to SQL, nested subqueries can sometimes be tricky to read
dfWork
.filter { person !in
dfWork.filter { job == "calibrate" }.getColumn { person } }
.distinct { person }
DataFrame: rowsCount = 2, columnsCount = 1
person |
---|
po |
tay |
// generally, it makes more logical sense to write sub-queries individually (similar to SQL CTEs),
// especially given a programming language's natural ability to maintain variables, rather than nesting subqueries inside-out, as is common in SQL
val colPerson = dfWork
.filter { job == "calibrate" }
.getColumn { person }
dfWork
.filter { person !in colPerson }
.distinct { person }
DataFrame: rowsCount = 2, columnsCount = 1
person |
---|
po |
tay |
// using `let` to avoid creating a new variable
dfWork
.filter { job == "calibrate" }
.getColumn { person }.let { calibraters ->
dfWork
.filter { person !in calibraters }
.distinct { person }
}
DataFrame: rowsCount = 2, columnsCount = 1
person |
---|
po |
tay |
40. autoincrement and primary key¶
// dataframe does not have a concept of primary key, nor will it enforce uniqueness
// it also does not have built-in sequences or auto-increment fields
val colName by columnOf<String>("mik","po","tay")
val ident by column<Long>()
// here's an emulation
val dfPerson = listOf(colName)
.toDataFrame()
.insert(ident) { it.index() + 1 }.at(0)
// .addId("id") is an alternative; however, it creates a 0-based index (you cannot add 1 to make it 1-based) and it can't use a column variable
dfPerson
DataFrame: rowsCount = 3, columnsCount = 2
ident | colName |
---|---|
1 | mik |
2 | po |
3 | tay |
// .addId() will automatically create a first column with indexed values (0-based)
listOf(colName)
.toDataFrame()
.addId("ident")
DataFrame: rowsCount = 3, columnsCount = 2
ident | colName |
---|---|
0 | mik |
1 | po |
2 | tay |
// dataframe does not enforce any uniqueness constraint
dfPerson.append(2, "frank")
DataFrame: rowsCount = 4, columnsCount = 2
ident | colName |
---|---|
1 | mik |
2 | po |
3 | tay |
2 | frank |
41. alter dataframe schema¶
dfJob // original
DataFrame: rowsCount = 2, columnsCount = 2
name | billable |
---|---|
calibrate | 1.500000 |
clean | 0.500000 |
dfJob.add(ident) { if (name == "calibrate") 1 else if (name == "clean") 2 else 0 }
DataFrame: rowsCount = 2, columnsCount = 3
name | billable | ident |
---|---|---|
calibrate | 1.500000 | 1 |
clean | 0.500000 | 2 |
// if you had a lot of possible values:
val identMap = mapOf(
"calibrate" to 1,
"clean" to 2
// ... lots more values
)
dfJob.add(ident) { identMap[name] }
// `when` is another option
DataFrame: rowsCount = 2, columnsCount = 3
name | billable | ident |
---|---|---|
calibrate | 1.500000 | 1 |
clean | 0.500000 | 2 |
42. create new dataframes from old¶
val dfNewWork = dfPerson
.innerJoin(dfWork) { colName match dfWork.person }
.innerJoin(dfJob.add(ident) { identMap[name] }) { job match dfJob.name }
.rename(ident).into("person_id")
.rename("ident1").into("job_id")
.select { "person_id" and "job_id" }
dfNewWork
DataFrame: rowsCount = 3, columnsCount = 2
person_id | job_id |
---|---|
1 | 1 |
1 | 2 |
2 | 2 |
43. remove tables¶
// DROP TABLE doesn't really have an equivalent
44. compare individual values to aggregates¶
dfPenguins.mean { body_mass_g }.let { bodyMassAvg ->
dfPenguins
.dropNulls { body_mass_g }
.filter { body_mass_g!! > bodyMassAvg }
}.select { body_mass_g!! }
.take(5)
DataFrame: rowsCount = 5, columnsCount = 1
body_mass_g |
---|
4675 |
4250 |
4400 |
4500 |
4650 |
45. compare individual values to aggregates within groups¶
val bodyMassGroupAvg by column<Double>()
dfPenguins.innerJoin(
dfPenguins
.groupBy { species }
.aggregate { mean { body_mass_g } into (bodyMassGroupAvg) }
).dropNulls { body_mass_g }
.filter { body_mass_g!! > bodyMassGroupAvg() }
.select { species and body_mass_g!! and bodyMassGroupAvg }
.update { bodyMassGroupAvg }.with { it.round(1) }
.take(5)
DataFrame: rowsCount = 5, columnsCount = 3
species | body_mass_g | bodyMassGroupAvg |
---|---|---|
Adelie | 3750 | 3700.700000 |
Adelie | 3800 | 3700.700000 |
Adelie | 4675 | 3700.700000 |
Adelie | 4250 | 3700.700000 |
Adelie | 3800 | 3700.700000 |
46. common table expressions (CTEs)¶
CTEs are a SQL concept where instead of nesting subqueries, we define subqueries ahead of time then reference them in the primary query
they are essentially "syntactic sugar," making a query easier to write and read
more info on CTEs can be found online
// CTEs are essentially the same as pre-defining variables with sub-selects
val dfGroupedPenguins = dfPenguins
.groupBy { species }
.aggregate { mean { body_mass_g } into ("bodyMassGroupAvg") }
dfPenguins
.innerJoin(dfGroupedPenguins)
.dropNulls { body_mass_g }
.filter { body_mass_g!! > "bodyMassGroupAvg"<Double>() }
.select { species and body_mass_g!! and "bodyMassGroupAvg" }
.take(5)
DataFrame: rowsCount = 5, columnsCount = 3
species | body_mass_g | bodyMassGroupAvg |
---|---|---|
Adelie | 3750 | 3700.662252 |
Adelie | 3800 | 3700.662252 |
Adelie | 4675 | 3700.662252 |
Adelie | 4250 | 3700.662252 |
Adelie | 3800 | 3700.662252 |
47. enumerate rows¶
dfPenguins
.insert("rowID") { index() + 1 }.at(0)
.select { "rowID" and species and island }
.take(5)
DataFrame: rowsCount = 5, columnsCount = 3
rowID | species | island |
---|---|---|
1 | Adelie | Torgersen |
2 | Adelie | Torgersen |
3 | Adelie | Torgersen |
4 | Adelie | Torgersen |
5 | Adelie | Torgersen |
dfPenguins
.select { species and island }
.addId("rowID")
.take(5)
DataFrame: rowsCount = 5, columnsCount = 3
rowID | species | island |
---|---|---|
0 | Adelie | Torgersen |
1 | Adelie | Torgersen |
2 | Adelie | Torgersen |
3 | Adelie | Torgersen |
4 | Adelie | Torgersen |
48. if-else function¶
// dataframe doesn't need dedicated keywords, just standard Kotlin
dfPenguins
.dropNulls { body_mass_g }
.convert { body_mass_g.castToNotNullable() }.with { if (it < 3500) "small" else "large"}
// .convert { body_mass_g }.with { if ((it ?: 9999) < 3500) "small" else "large"}
.rename { body_mass_g }.into("size")
.groupBy { species and "size"}
.count()
.sortBy { species and "count" }
// counts below are slightly different than source SQL because we filtered the nulls
// the source SQL is classifying the NULL values as "large" (because NULL is not < 3500, so it falls to the `else`)
// just another reason to decide how to handle nulls ahead of time
DataFrame: rowsCount = 5, columnsCount = 3
species | size | count |
---|---|---|
Adelie | small | 54 |
Adelie | large | 97 |
Chinstrap | small | 17 |
Chinstrap | large | 51 |
Gentoo | large | 123 |
49. select case with when¶
dfPenguins
.dropNulls { body_mass_g }
.convert { body_mass_g.castToNotNullable() }.with {
when {
it < 3500 -> "small"
it < 5000 -> "medium"
// ... especially useful when there are lots of cases
// a map also works, as we demonstrated in #41
else -> "large"
}
}
.rename { body_mass_g }.into("size")
.groupBy { species and "size"}
.count()
.sortBy { species and "count" }
DataFrame: rowsCount = 6, columnsCount = 3
species | size | count |
---|---|---|
Adelie | small | 54 |
Adelie | medium | 97 |
Chinstrap | small | 17 |
Chinstrap | medium | 51 |
Gentoo | medium | 56 |
Gentoo | large | 67 |
50. check range¶
dfPenguins
.dropNulls { it.body_mass_g }
.convert { body_mass_g.castToNotNullable() }.with {
when {
it in (3500..5000) -> "normal" // `it` refers to body_mass_g
else -> "abnormal"
}
}
.rename { body_mass_g }.into("size")
.groupBy { species and "size"}
.count()
.sortBy { species and "count" }
DataFrame: rowsCount = 6, columnsCount = 3
species | size | count |
---|---|---|
Adelie | abnormal | 54 |
Adelie | normal | 97 |
Chinstrap | abnormal | 17 |
Chinstrap | normal | 51 |
Gentoo | abnormal | 61 |
Gentoo | normal | 62 |
51. pattern matching¶
val assaysConn : Connection = DriverManager.getConnection("jdbc:sqlite:assays.db")
val dfStaff = DataFrame.readSqlTable(assaysConn, "staff")
dfStaff
DataFrame: rowsCount = 10, columnsCount = 5
ident | personal | family | dept | age |
---|---|---|---|---|
1 | Kartik | Gupta | null | 46 |
2 | Divit | Dhaliwal | hist | 34 |
3 | Indrans | Sridhar | mb | 47 |
4 | Pranay | Khanna | mb | 51 |
5 | Riaan | Dua | null | 23 |
6 | Vedika | Rout | hist | 45 |
7 | Abram | Chokshi | gen | 23 |
8 | Romil | Kapoor | hist | 38 |
9 | Ishaan | Ramaswamy | mb | 35 |
10 | Nitya | Lal | gen | 52 |
// Kotlin doesn't have keywords `LIKE` or `GLOB` (although GLOB works in SQLite, it is not standard SQL)
// but with Regex we can create much more complex matchers if necessary
// We also have every other Kotlin string function (and numeric functions for numeric columns)
// and, we can define our own functions
dfStaff
.filter { Regex("ya").containsMatchIn(personal) || Regex("De").containsMatchIn(family) }
.select { personal and family }
DataFrame: rowsCount = 1, columnsCount = 2
personal | family |
---|---|
Nitya | Lal |
// defining a custom function
fun startsWithK(test : String) = test.startsWith("K")
dfStaff
.filter { startsWithK(family) }
.select { personal and family }
DataFrame: rowsCount = 2, columnsCount = 2
personal | family |
---|---|
Pranay | Khanna |
Romil | Kapoor |
52. select first and last rows with head and tail¶
val dfExperiment = DataFrame.readSqlTable(assaysConn, "experiment")
// this also uses `concat` like SQL UNION
dfExperiment.sortBy { started }.head(5)
.concat(dfExperiment.sortBy { started }.tail(5))
.sortBy { started }
DataFrame: rowsCount = 10, columnsCount = 4
ident | kind | started | ended |
---|---|---|---|
17 | trial | 2023-01-29 | 2023-01-30 |
35 | calibration | 2023-01-30 | 2023-01-30 |
36 | trial | 2023-02-02 | 2023-02-03 |
25 | trial | 2023-02-12 | 2023-02-14 |
2 | calibration | 2023-02-14 | 2023-02-14 |
40 | calibration | 2024-01-21 | 2024-01-21 |
12 | trial | 2024-01-26 | 2024-01-28 |
44 | trial | 2024-01-27 | 2024-01-29 |
34 | trial | 2024-02-01 | 2024-02-02 |
14 | calibration | 2024-02-03 | 2024-02-03 |
53. intersection¶
// intersect isn't part of the dataframe API, but it exists in Kotlin; we just need to convert each df to an Iterable with `rows()`
// then back to a df post-intersect
// note this (either SQL or df) is only usable when the two data sources have the exact same schema
dfStaff.filter { age < 50 }.rows()
.intersect(dfStaff.filter { dept == "mb" }.rows())
.toDataFrame()
.select { personal and family and dept and age }
DataFrame: rowsCount = 2, columnsCount = 4
personal | family | dept | age |
---|---|---|---|
Indrans | Sridhar | mb | 47 |
Ishaan | Ramaswamy | mb | 35 |
54. exclusion¶
// `except` can very often be replaced more effectively by combining filters
dfStaff
.filter { it.dept == "mb" }
.filter { ! (it.age < 50) }
.select { personal and family and dept and age }
DataFrame: rowsCount = 1, columnsCount = 4
personal | family | dept | age |
---|---|---|---|
Pranay | Khanna | mb | 51 |
// Kotlin's `minus` works like SQL except
// just like `intersect`, we are comparing collections of rows, rather than directly in the dataframe itself
dfStaff.filter { dept == "mb" }.rows()
.minus(dfStaff.filter { age < 50 }.rows())
.toDataFrame()
.select { personal and family and dept and age }
DataFrame: rowsCount = 1, columnsCount = 4
personal | family | dept | age |
---|---|---|---|
Pranay | Khanna | mb | 51 |
55. random numbers¶
// just using Kotlin standard Random functions
import kotlin.random.Random
dfStaff
.add {
"rand" from { Random.nextLong() }
"selector" from { Random.nextInt(from = 0, until = 9) }
"name" from { "${it.personal} ${it.family}" }
}
.filter { "selector"<Int>() < 5 }
.select { "rand" and "selector" and "name" }
DataFrame: rowsCount = 5, columnsCount = 3
rand | selector | name |
---|---|---|
-8912889188014221159 | 1 | Kartik Gupta |
5976975164977345010 | 1 | Divit Dhaliwal |
-2789984030899021634 | 0 | Indrans Sridhar |
-9214500621502408558 | 4 | Pranay Khanna |
-3180529001518403764 | 4 | Romil Kapoor |
56. creating an index¶
neither dataframe nor Kotlin collections support something equivalent to a database index, which is typically implemented via a hash or b-tree (although Kotlin does support a binary search if the collection is sorted)
especially when dealing with larger tables, utilizing existing indices to speed up data retrieval is a very good reason to prefer filtering (WHERE) via SQL, before the dataframe
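As a tiny illustration of the binary-search point, after pulling a column's values into a sorted List, Kotlin's standard binarySearch applies (a one-off search, not a persistent index; this assumes the generated `body_mass_g` accessor from the Penguin schema):
// sort the body-mass values into a plain List, then binary-search it
val sortedMasses: List<Int> = dfPenguins.body_mass_g.toList().filterNotNull().sorted()
sortedMasses.binarySearch(4000) // index if found, otherwise -(insertionPoint + 1)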
57. generate sequence¶
val mySeries by IntRange(1, 5).toList().toColumn()
mySeries
DataColumn: name = "mySeries", type = Int, size = 5
mySeries |
---|
1 |
2 |
3 |
4 |
5 |
IntRange(0, 20).step(2).reversed().toList().toColumn().named("evenNumbers")
DataColumn: name = "evenNumbers", type = Int, size = 11
evenNumbers |
---|
20 |
18 |
16 |
14 |
12 |
10 |
8 |
6 |
4 |
2 |
0 |
58. generate sequence based on data¶
// creating a simple df with min value = 1, max value = 15
val dfTemp = dataFrameOf(
"num" to listOf(1, 15)
)
dfTemp
DataFrame: rowsCount = 2, columnsCount = 1
num |
---|
1 |
15 |
// use the above df to create a sequence based on min & max
val series by IntRange(dfTemp.minOf{ "num"<Int>() }, dfTemp.maxOf{ "num"<Int>() }).toList().toColumn()
series
DataColumn: name = "series", type = Int, size = 15
series |
---|
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
59. generate sequence of dates¶
import java.time.LocalDate
import java.time.temporal.ChronoUnit
dfExperiment.minOf{ started }
2023-01-29
IntRange(0, dfExperiment.count())
.map { LocalDate.parse(dfExperiment.minOf{ started }).plusDays(it.toLong()) }
.toColumn("someDays")
.take(5)
DataColumn: name = "someDays", type = java.time.LocalDate, size = 5
someDays |
---|
2023-01-29 |
2023-01-30 |
2023-01-31 |
2023-02-01 |
2023-02-02 |
60. count experiments started per day without gaps¶
the source SQL is not correct (https://gvwilson.github.io/sql-tutorial/, #60)
the limit 5 hides the error... without the limit, the query returns only 50 rows (corresponding to 50 total rows in the table) but the full date range is 2023-01-29 to 2024-02-03
but the point of this query is to return every day without gaps
// create a range of every day (all 371) between the column's min and max values
val colAllDays : DataColumn<LocalDate> =
LongRange(0L, ChronoUnit.DAYS.between(
LocalDate.parse(dfExperiment.minOf{ started }),
LocalDate.parse(dfExperiment.maxOf{ started })
))
.map { LocalDate.parse(dfExperiment.minOf{ started }).plusDays(it) }
.toColumn("day")
colAllDays // 371 rows
... showing only top 20 of 371 rows
DataColumn: name = "day", type = java.time.LocalDate, size = 371
day |
---|
2023-01-29 |
2023-01-30 |
2023-01-31 |
2023-02-01 |
2023-02-02 |
2023-02-03 |
2023-02-04 |
2023-02-05 |
2023-02-06 |
2023-02-07 |
2023-02-08 |
2023-02-09 |
2023-02-10 |
2023-02-11 |
2023-02-12 |
2023-02-13 |
2023-02-14 |
2023-02-15 |
2023-02-16 |
2023-02-17 |
colAllDays.toDataFrame().leftJoinWith(
dfExperiment
.groupBy { started }
.aggregate {
count() into "experimentCount"
}
.convert { started }.with { LocalDate.parse(it) }
) { it["day"] == right["started"] } // "it" refers to first/left df, "right" refers to second/right
.select { "day" and "experimentCount" }
.fillNulls("experimentCount").with { 0 }
// 2 days with >1 experiment -- 2023-09-07 and 2024-01-19
... showing only top 20 of 371 rows
DataFrame: rowsCount = 371, columnsCount = 2
day | experimentCount |
---|---|
2023-01-29 | 1 |
2023-01-30 | 1 |
2023-01-31 | 0 |
2023-02-01 | 0 |
2023-02-02 | 1 |
2023-02-03 | 0 |
2023-02-04 | 0 |
2023-02-05 | 0 |
2023-02-06 | 0 |
2023-02-07 | 0 |
2023-02-08 | 0 |
2023-02-09 | 0 |
2023-02-10 | 0 |
2023-02-11 | 0 |
2023-02-12 | 1 |
2023-02-13 | 0 |
2023-02-14 | 1 |
2023-02-15 | 1 |
2023-02-16 | 0 |
2023-02-17 | 0 |
61. self join¶
// simulating the first CTE 'person'
val dfStaffPersons = dfStaff
.add("name") { "${it.personal} ${it.family}" }
.select { ident and "name" }
dfStaffPersons
DataFrame: rowsCount = 10, columnsCount = 2
ident | name |
---|---|
1 | Kartik Gupta |
2 | Divit Dhaliwal |
3 | Indrans Sridhar |
4 | Pranay Khanna |
5 | Riaan Dua |
6 | Vedika Rout |
7 | Abram Chokshi |
8 | Romil Kapoor |
9 | Ishaan Ramaswamy |
10 | Nitya Lal |
dfStaffPersons
.joinWith(dfStaffPersons) { true } // cross join, every row on left joined with every row on right, no match necessary
.select { "name" and "name1" }
.head(20)
DataFrame: rowsCount = 20, columnsCount = 2
name | name1 |
---|---|
Kartik Gupta | Kartik Gupta |
Kartik Gupta | Divit Dhaliwal |
Kartik Gupta | Indrans Sridhar |
Kartik Gupta | Pranay Khanna |
Kartik Gupta | Riaan Dua |
Kartik Gupta | Vedika Rout |
Kartik Gupta | Abram Chokshi |
Kartik Gupta | Romil Kapoor |
Kartik Gupta | Ishaan Ramaswamy |
Kartik Gupta | Nitya Lal |
Divit Dhaliwal | Kartik Gupta |
Divit Dhaliwal | Divit Dhaliwal |
Divit Dhaliwal | Indrans Sridhar |
Divit Dhaliwal | Pranay Khanna |
Divit Dhaliwal | Riaan Dua |
Divit Dhaliwal | Vedika Rout |
Divit Dhaliwal | Abram Chokshi |
Divit Dhaliwal | Romil Kapoor |
Divit Dhaliwal | Ishaan Ramaswamy |
Divit Dhaliwal | Nitya Lal |
62. generate unique pairs¶
// note the syntax here --
// in the filter, the original `ident` column is still accessible, but the new `ident1` (created as a result of the join) must be referred to by String
// same is true in the final select with `name` and `name1`
dfStaffPersons
.joinWith(dfStaffPersons) { it.ident!! < right.ident!! } // this cross join has conditions, the left ident must be less than the right ident
.filter { ident() <= 4 && "ident1"<Int>() <= 4 }
.select { name and "name1" }
DataFrame: rowsCount = 6, columnsCount = 2
name | name1 |
---|---|
Kartik Gupta | Divit Dhaliwal |
Kartik Gupta | Indrans Sridhar |
Kartik Gupta | Pranay Khanna |
Divit Dhaliwal | Indrans Sridhar |
Divit Dhaliwal | Pranay Khanna |
Indrans Sridhar | Pranay Khanna |
63. filter pairs¶
// it appears each experiment had up to 2 people work on it
// this subquery matches up each experiment with its two staffers
val dfPerformed = DataFrame.readSqlTable(assaysConn, "performed")
val dfTogether = dfPerformed
.innerJoin(dfPerformed) { "experiment"<Int>() }
.filter { "staff"<Int>() < "staff1"<Int>() }
.move("experiment").toLeft()
dfTogether
DataFrame: rowsCount = 15, columnsCount = 3
experiment | staff | staff1 |
---|---|---|
3 | 1 | 6 |
8 | 4 | 6 |
12 | 3 | 8 |
13 | 7 | 9 |
17 | 4 | 6 |
20 | 1 | 7 |
25 | 7 | 8 |
29 | 1 | 2 |
33 | 2 | 7 |
34 | 4 | 9 |
36 | 3 | 8 |
38 | 1 | 9 |
43 | 1 | 10 |
47 | 1 | 7 |
50 | 4 | 8 |
// based on the subquery above, we replace the id's with the staffer's name to get a list of all staff pairs who worked on an experiment together
dfTogether
.innerJoinWith(dfStaffPersons) { it["staff"] == right["ident"] }
.innerJoinWith(dfStaffPersons) { it["staff1"] == right["ident"] }
.select { "name" and "name1" }
.rename("name").into("person_1")
.rename("name1").into("person_2")
DataFrame: rowsCount = 15, columnsCount = 2
person_1 | person_2 |
---|---|
Kartik Gupta | Vedika Rout |
Pranay Khanna | Vedika Rout |
Indrans Sridhar | Romil Kapoor |
Abram Chokshi | Ishaan Ramaswamy |
Pranay Khanna | Vedika Rout |
Kartik Gupta | Abram Chokshi |
Abram Chokshi | Romil Kapoor |
Kartik Gupta | Divit Dhaliwal |
Divit Dhaliwal | Abram Chokshi |
Pranay Khanna | Ishaan Ramaswamy |
Indrans Sridhar | Romil Kapoor |
Kartik Gupta | Ishaan Ramaswamy |
Kartik Gupta | Nitya Lal |
Kartik Gupta | Abram Chokshi |
Pranay Khanna | Romil Kapoor |
64. existence and correlated subqueries¶
val dfDepartment = DataFrame.readSqlTable(assaysConn, "department")
dfDepartment
DataFrame: rowsCount = 4, columnsCount = 3
ident | name | building |
---|---|---|
gen | Genetics | Chesson |
hist | Histology | Fashet Extension |
mb | Molecular Biology | Chesson |
end | Endocrinology | TGVH |
dfStaff
DataFrame: rowsCount = 10, columnsCount = 5
ident | personal | family | dept | age |
---|---|---|---|---|
1 | Kartik | Gupta | null | 46 |
2 | Divit | Dhaliwal | hist | 34 |
3 | Indrans | Sridhar | mb | 47 |
4 | Pranay | Khanna | mb | 51 |
5 | Riaan | Dua | null | 23 |
6 | Vedika | Rout | hist | 45 |
7 | Abram | Chokshi | gen | 23 |
8 | Romil | Kapoor | hist | 38 |
9 | Ishaan | Ramaswamy | mb | 35 |
10 | Nitya | Lal | gen | 52 |
// some of the source examples use rather convoluted syntax -- often, standard joins are clearer and work better
// (see note after #65 in source, "avoiding correlated subqueries")
dfDepartment
.filter { ident in dfStaff.dept }
.select { name and building }
DataFrame: rowsCount = 3, columnsCount = 2
name | building |
---|---|
Genetics | Chesson |
Histology | Fashet Extension |
Molecular Biology | Chesson |
// same result with an inner join
dfDepartment.innerJoin(dfStaff) { ident match right.dept }
.distinct { name and building }
DataFrame: rowsCount = 3, columnsCount = 2
name | building |
---|---|
Genetics | Chesson |
Histology | Fashet Extension |
Molecular Biology | Chesson |
65. non-existence¶
dfDepartment
.filter { ! (ident in dfStaff.dept) }
.select { name and building }
DataFrame: rowsCount = 1, columnsCount = 2
name | building |
---|---|
Endocrinology | TGVH |
66. lead and lag¶
// lead and lag, or shift and offset, are not part of the dataframe API yet, although an issue has been raised
dfExperiment
... showing only top 20 of 50 rows
DataFrame: rowsCount = 50, columnsCount = 4
ident | kind | started | ended |
---|---|---|---|
1 | calibration | 2023-08-25 | 2023-08-25 |
2 | calibration | 2023-02-14 | 2023-02-14 |
3 | trial | 2023-02-22 | 2023-02-24 |
4 | trial | 2023-10-10 | 2023-10-11 |
5 | calibration | 2024-01-19 | 2024-01-19 |
6 | trial | 2023-02-15 | 2023-02-16 |
7 | trial | 2023-06-22 | 2023-06-23 |
8 | trial | 2023-05-03 | 2023-05-04 |
9 | calibration | 2024-01-19 | 2024-01-19 |
10 | trial | 2023-05-23 | 2023-05-24 |
11 | trial | 2023-03-17 | 2023-03-18 |
12 | trial | 2024-01-26 | 2024-01-28 |
13 | trial | 2023-05-25 | 2023-05-26 |
14 | calibration | 2024-02-03 | 2024-02-03 |
15 | trial | 2023-03-09 | 2023-03-11 |
16 | calibration | 2023-05-22 | 2023-05-22 |
17 | trial | 2023-01-29 | 2023-01-30 |
18 | calibration | 2023-03-15 | 2023-03-15 |
19 | calibration | 2023-09-21 | 2023-09-21 |
20 | trial | 2023-12-16 | 2023-12-17 |
// first, we group each experiment into the month it was started, and generate a count for each month
val startMonth by column<LocalDate>()
val dfMonthlyCount = dfExperiment
.add(startMonth) { LocalDate.parse(started).withDayOfMonth(1) } // converting start date to its month (by changing date to 1)
.groupBy { startMonth }
.count()
.sortBy { startMonth }
.rename("count").into("experiments")
.addId()
dfMonthlyCount
DataFrame: rowsCount = 13, columnsCount = 3
id | startMonth | experiments |
---|---|---|
0 | 2023-01-01 | 2 |
1 | 2023-02-01 | 5 |
2 | 2023-03-01 | 5 |
3 | 2023-04-01 | 1 |
4 | 2023-05-01 | 6 |
5 | 2023-06-01 | 5 |
6 | 2023-07-01 | 3 |
7 | 2023-08-01 | 2 |
8 | 2023-09-01 | 4 |
9 | 2023-10-01 | 6 |
10 | 2023-12-01 | 4 |
11 | 2024-01-01 | 5 |
12 | 2024-02-01 | 2 |
// from the df above, we can emulate lead/lag by left-joining it on itself, adding/subtracting 1 from the right-side id
dfMonthlyCount
.leftJoinWith(dfMonthlyCount) { id == right.id + 1 }
.leftJoinWith(dfMonthlyCount) { id == right.id - 1 }
.select { startMonth and "experiments1" and experiments and "experiments2" }
.rename("experiments1").into("prevMonth")
.rename("experiments").into("currentMonth")
.rename("experiments2").into("nextMonth")
DataFrame: rowsCount = 13, columnsCount = 4
startMonth | prevMonth | currentMonth | nextMonth |
---|---|---|---|
2023-01-01 | null | 2 | 5 |
2023-02-01 | 2 | 5 | 5 |
2023-03-01 | 5 | 5 | 1 |
2023-04-01 | 5 | 1 | 6 |
2023-05-01 | 1 | 6 | 5 |
2023-06-01 | 6 | 5 | 3 |
2023-07-01 | 5 | 3 | 2 |
2023-08-01 | 3 | 2 | 4 |
2023-09-01 | 2 | 4 | 6 |
2023-10-01 | 4 | 6 | 4 |
2023-12-01 | 6 | 4 | 5 |
2024-01-01 | 4 | 5 | 2 |
2024-02-01 | 5 | 2 | null |
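as an aside, a simpler alternative sketch -- assuming the `prev()`/`next()` row expressions are available in your dataframe version (if not, the self-join above works everywhere):
// prev() and next() return the neighboring DataRow (or null at the edges)
dfMonthlyCount
    .add("prevMonth") { prev()?.get("experiments") }
    .add("nextMonth") { next()?.get("experiments") }
    .select { startMonth and "prevMonth" and "experiments" and "nextMonth" }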
67. window functions¶
query #67 is another query where the source document contains a bug
the `progress` column is linear, and not based on the `num_done` column, which is what was intended
in our result, the `progress` column will be based on the running total
// df API includes a cumulative sum function, however we need to calculate it independently then add it to our dataframe
// create the cumulative sum column
val cumulativeCount : DataColumn<Int> = dfMonthlyCount.experiments.cumSum().named("runningTotal")
dfMonthlyCount
.add(cumulativeCount) // add the column
.add("progress") { "runningTotal"<Int>() / dfMonthlyCount.sumOf{experiments}.toDouble() } // calculate the progress %age inline
// probably better to calculate the sumOf once and assign it to a variable, as opposed to repeating the calculation for every row
// note that the `progress` below is based on a %age of the total (50 experiments), and is not linear
DataFrame: rowsCount = 13, columnsCount = 5
id | startMonth | experiments | runningTotal | progress |
---|---|---|---|---|
0 | 2023-01-01 | 2 | 2 | 0.040000 |
1 | 2023-02-01 | 5 | 7 | 0.140000 |
2 | 2023-03-01 | 5 | 12 | 0.240000 |
3 | 2023-04-01 | 1 | 13 | 0.260000 |
4 | 2023-05-01 | 6 | 19 | 0.380000 |
5 | 2023-06-01 | 5 | 24 | 0.480000 |
6 | 2023-07-01 | 3 | 27 | 0.540000 |
7 | 2023-08-01 | 2 | 29 | 0.580000 |
8 | 2023-09-01 | 4 | 33 | 0.660000 |
9 | 2023-10-01 | 6 | 39 | 0.780000 |
10 | 2023-12-01 | 4 | 43 | 0.860000 |
11 | 2024-01-01 | 5 | 48 | 0.960000 |
12 | 2024-02-01 | 2 | 50 | 1.000000 |
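a small variant of the cell above (just a sketch): compute the grand total once and reuse it, instead of re-summing for every row
// the grand total is calculated a single time, outside the row expression
val totalExperiments = dfMonthlyCount.sumOf { experiments }.toDouble()
dfMonthlyCount
    .add(cumulativeCount)
    .add("progress") { "runningTotal"<Int>() / totalExperiments }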
68. partitioned windows¶
window functions are not directly supported in the dataframe API, however most can be emulated, often by partitioning the data manually, performing the calculation, then `concat()`ing the partitions
// in the above query, the calendar month was represented by a single date column
// on this one, we need a column for year and a separate column for month number
val theYear by column<Int>()
val theMonth by column<Int>()
val dfMonthlyCount2 = dfExperiment
.add(theYear) { LocalDate.parse(started).getYear() }
.add(theMonth) { LocalDate.parse(started).getMonthValue() }
.sortBy { theYear and theMonth }
.groupBy { theYear and theMonth }
.count()
.rename("count").into("experiments")
// val annualTotalsMap = dfMonthlyCount2
// .groupBy { theYear }
// .sumOf { "experiments"<Int>() }
// .associate { it.getValue<Int>("theYear") to it.getValue<Int>("sum") }
dfMonthlyCount2 // same data as dfMonthlyCount above
DataFrame: rowsCount = 13, columnsCount = 3
theYear | theMonth | experiments |
---|---|---|
2023 | 1 | 2 |
2023 | 2 | 5 |
2023 | 3 | 5 |
2023 | 4 | 1 |
2023 | 5 | 6 |
2023 | 6 | 5 |
2023 | 7 | 3 |
2023 | 8 | 2 |
2023 | 9 | 4 |
2023 | 10 | 6 |
2023 | 12 | 4 |
2024 | 1 | 5 |
2024 | 2 | 2 |
// this is where we get the key value of each partition (every distinct `year` value)
val years : Set<Int> = dfMonthlyCount2.distinct { theYear }.getColumn { theYear }.toSet()
years
[2023, 2024]
// now we take each year, generate a data partition, do the cumulative count calculation on that partition
// and finally re-join all the partitions back into a single dataframe
years
.map { year ->
val dfSubset = dfMonthlyCount2.filter { theYear == year } // creates a partition of data only for the current `year` value
val cumulativeCount : DataColumn<Int> = dfSubset.experiments.cumSum().named("runningTotal")
dfSubset.add(cumulativeCount)
}.concat()
DataFrame: rowsCount = 13, columnsCount = 4
theYear | theMonth | experiments | runningTotal |
---|---|---|---|
2023 | 1 | 2 | 2 |
2023 | 2 | 5 | 7 |
2023 | 3 | 5 | 12 |
2023 | 4 | 1 | 13 |
2023 | 5 | 6 | 19 |
2023 | 6 | 5 | 24 |
2023 | 7 | 3 | 27 |
2023 | 8 | 2 | 29 |
2023 | 9 | 4 | 33 |
2023 | 10 | 6 | 39 |
2023 | 12 | 4 | 43 |
2024 | 1 | 5 | 5 |
2024 | 2 | 2 | 7 |
69. blobs¶
dataframes can contain virtually any type of object in a column... for example File objects
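a minimal sketch (the file names here are hypothetical): a column of File handles alongside ordinary values
import java.io.File

// any JVM object can live in a column -- here, File objects pointing at (hypothetical) image files
dataFrameOf("species", "photo")(
    "Adelie", File("adelie.png"),
    "Gentoo", File("gentoo.png")
)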
70. store JSON¶
many relational databases today support a JSON data type which can contain arbitrary, schema-less JSON objects
most of them also provide a syntax for querying that JSON data
Kotlin generally doesn't work too well with untyped data, although kotlinx.serialization provides a JsonObject data type
when working with JSON data in Kotlin, you have many libraries to choose from (moshi, jackson, gson, klaxon...), but here we will use kotlinx.serialization
note that the dataframe API can import a JSON document and build a dataframe from it -- this is a different case, where we are reading a JSON column that is part of a relational database table
val labLogConn : Connection = DriverManager.getConnection("jdbc:sqlite:lab_log.db")
val dfMachine = DataFrame.readSqlTable(labLogConn, "machine")
dfMachine
DataFrame: rowsCount = 3, columnsCount = 3
ident | name | details |
---|---|---|
1 | WY401 | {"acquired": "2023-05-01"} |
2 | Inphormex | {"acquired": "2021-07-15", "refurbish... |
3 | AutoPlate 9000 | {"note": "needs software update"} |
// when importing relational data, Kotlin will treat a JSON data column as just a String
dfMachine.describe()
DataFrame: rowsCount = 3, columnsCount = 12
name | type | count | unique | nulls | top | freq | mean | std | min | median | max |
---|---|---|---|---|---|---|---|---|---|---|---|
ident | Int? | 3 | 3 | 0 | 1 | 1 | 2.000000 | 1.000000 | 1 | 2 | 3 |
name | String | 3 | 3 | 0 | WY401 | 1 | null | null | AutoPlate 9000 | Inphormex | WY401 |
details | String | 3 | 3 | 0 | {"acquired": "2023-05-01"} | 1 | null | null | {"acquired": "2021-07-15", "refurbish... | {"acquired": "2023-05-01"} | {"note": "needs software update"} |
71. select field from JSON¶
dataframe has multiple options for reading/parsing JSON, some of which are undocumented at the time of this writing:
- parse()
- readJsonStr()
- full deserialization strategy with kotlinx.serialization (or alternative library)
// parse is easiest, each JSON field generates a nested column
// note that none of these will automatically convert a source String to a LocalDate - that must be done manually
dfMachine
.parse { details }
.convert { details["acquired"] and details["refurbished"] }.with { it?.let { LocalDate.parse(it.toString()) } }
DataFrame: rowsCount = 3, columnsCount = 3
ident | name | details |
---|---|---|
1 | WY401 | { acquired:2023-05-01 } |
2 | Inphormex | { acquired:2021-07-15, refurbished:20... |
3 | AutoPlate 9000 | { note:needs software update } |
// convert() with DataRow.readJsonStr() gives the same hierarchical result
dfMachine
.convert { details }.with { DataRow.readJsonStr(it) }
.convert { details["acquired"] and details["refurbished"] }.with { it?.let { LocalDate.parse(it.toString()) } }
DataFrame: rowsCount = 3, columnsCount = 3
ident | name | details |
---|---|---|
1 | WY401 | { acquired:2023-05-01 } |
2 | Inphormex | { acquired:2021-07-15, refurbished:20... |
3 | AutoPlate 9000 | { note:needs software update } |
// with kotlinx.serialization, instead of creating individual columns for every field, we can create a single column with a `details` class
// every row's JSON has some null/unspecified values, so the data is messy and we need to be extra careful
import kotlinx.serialization.*
import kotlinx.serialization.json.*
import kotlinx.serialization.descriptors.PrimitiveKind
import kotlinx.serialization.descriptors.PrimitiveSerialDescriptor
import kotlinx.serialization.descriptors.SerialDescriptor
import kotlinx.serialization.encoding.Decoder
import kotlinx.serialization.encoding.Encoder
// normally we don't need this much setup, but kotlinx.serialization does not have a built-in LocalDate serializer
object LocalDateSerializer: KSerializer<LocalDate> {
override val descriptor: SerialDescriptor =
PrimitiveSerialDescriptor("LocalDateSerializer", PrimitiveKind.STRING)
override fun serialize(encoder: Encoder, value: LocalDate) {
encoder.encodeString(value.toString())
}
override fun deserialize(decoder: Decoder): LocalDate {
return LocalDate.parse(decoder.decodeString())
}
}
// this is the schema based on the JSON data in the "details" column of the "machine" source table
@Serializable
@DataSchema
data class MachineDetails (
@Serializable(with= LocalDateSerializer::class) val acquired : LocalDate? = null, // nullable because this field is sometimes missing
@Serializable(with= LocalDateSerializer::class) val refurbished : LocalDate? = null,
val note : String = ""
)
// our goal is to convert the imported JSON document to a native object
// `decodeFromString()` will create MachineDetails objects from the JSON string
// this will create just one `details` column, not 3 like the above examples
dfMachine
.convert { details }.with { Json.decodeFromString<MachineDetails>(it) }
DataFrame: rowsCount = 3, columnsCount = 3
ident | name | details |
---|---|---|
1 | WY401 | MachineDetails(acquired=2023-05-01, r... |
2 | Inphormex | MachineDetails(acquired=2021-07-15, r... |
3 | AutoPlate 9000 | MachineDetails(acquired=null, refurbi... |
// once we have class instances in a column, we can access their native properties
dfMachine
.convert { details }.with { Json.decodeFromString<MachineDetails>(it) }
.add("acquiredYear") { "details"<MachineDetails>().acquired?.year }
.select("acquiredYear")
DataFrame: rowsCount = 3, columnsCount = 1
acquiredYear |
---|
2023 |
2021 |
null |
72. JSON array access¶
val dfUsage = DataFrame.readSqlTable(labLogConn, "usage")
dfUsage
DataFrame: rowsCount = 8, columnsCount = 2
ident | log |
---|---|
1 | [{"machine": "Inphormex", "person": [... |
2 | [{"machine": "Inphormex", "person": [... |
3 | [{"machine": "sterilizer", "person": ... |
4 | [{"machine": "sterilizer", "person": ... |
5 | [{"machine": "AutoPlate 9000", "perso... |
6 | [{"machine": "sterilizer", "person": ... |
7 | [{"machine": "WY401", "person": ["Mau... |
8 | [{"machine": "AutoPlate 9000"}] |
// `parse` is simplest, but you have less control over the output, and likely need some column variables
val mylog by frameColumn()
val firstObj by frameColumn()
dfUsage
.parse { log named "mylog" }
.add("length") { mylog().count() }
.add(firstObj) { mylog().take(1) }
.split(firstObj)
Split
ident | mylog | length | split1 |
---|---|---|---|
1 | [4 x 2] | 4 | { machine:Inphormex, person:[Gabriell... |
2 | [5 x 2] | 5 | { machine:Inphormex, person:[Marianne... |
3 | [2 x 2] | 2 | { machine:sterilizer, person:[Josette... |
4 | [1 x 2] { machine:sterilizer, person:... | 1 | { machine:sterilizer, person:[Maude, ... |
5 | [2 x 2] | 2 | { machine:AutoPlate 9000, person:[Bri... |
6 | [1 x 2] { machine:sterilizer, person:... | 1 | { machine:sterilizer, person:[Mariann... |
7 | [3 x 2] | 3 | { machine:WY401, person:[Maude, Goule... |
8 | [1 x 1] { machine:AutoPlate 9000 } | 1 | { machine:AutoPlate 9000 } |
// a defined deserialization strategy (using kotlinx.serialization, or another library) provides native object creation
@Serializable
@DataSchema
data class UsageDetail (
val machine: String,
val person: List<String> = listOf()
)
dfUsage.add {
"length" from { Json.decodeFromString<List<UsageDetail>>(log).size }
"first" from { Json.decodeFromString<List<UsageDetail>>(log).first() }
}.select { ident and "length" and "first" }
DataFrame: rowsCount = 8, columnsCount = 3
ident | length | first |
---|---|---|
1 | 4 | UsageDetail(machine=Inphormex, person... |
2 | 5 | UsageDetail(machine=Inphormex, person... |
3 | 2 | UsageDetail(machine=sterilizer, perso... |
4 | 1 | UsageDetail(machine=sterilizer, perso... |
5 | 2 | UsageDetail(machine=AutoPlate 9000, p... |
6 | 1 | UsageDetail(machine=sterilizer, perso... |
7 | 3 | UsageDetail(machine=WY401, person=[Ma... |
8 | 1 | UsageDetail(machine=AutoPlate 9000, p... |
// instead of .decodeFromString() multiple times (above), you can perform it once and reference the result
dfUsage
.convert { log named "udList" }.with { Json.decodeFromString<List<UsageDetail>>(it) }
.add {
"length" from { "udList"<List<UsageDetail>>().size }
"first" from { "udList"<List<UsageDetail>>().first() }
}.select { ident and "length" and "first" }
DataFrame: rowsCount = 8, columnsCount = 3
ident | length | first |
---|---|---|
1 | 4 | UsageDetail(machine=Inphormex, person... |
2 | 5 | UsageDetail(machine=Inphormex, person... |
3 | 2 | UsageDetail(machine=sterilizer, perso... |
4 | 1 | UsageDetail(machine=sterilizer, perso... |
5 | 2 | UsageDetail(machine=AutoPlate 9000, p... |
6 | 1 | UsageDetail(machine=sterilizer, perso... |
7 | 3 | UsageDetail(machine=WY401, person=[Ma... |
8 | 1 | UsageDetail(machine=AutoPlate 9000, p... |
73. unpack a JSON array¶
`json_each` is proprietary; it is supported in PostgreSQL and SQLite, and maybe others
it replaces a JSON array with individual new rows for each element of the array
(i.e. a row with a JSON array of 4 elements is replaced with 4 rows containing a single element each)
dataframe enables similar operations, not restricted to JSON -- later we'll look into pivot, gather, flatten, explode, etc
dfUsage
.convert { log named "udList" }.with { Json.decodeFromString<List<UsageDetail>>(it) }
.split { "udList"<List<UsageDetail>>() }.intoRows()
//.head(10)
DataFrame: rowsCount = 19, columnsCount = 2
ident | udList |
---|---|
1 | UsageDetail(machine=Inphormex, person... |
1 | UsageDetail(machine=Inphormex, person... |
1 | UsageDetail(machine=WY401, person=[Ga... |
1 | UsageDetail(machine=Inphormex, person... |
2 | UsageDetail(machine=Inphormex, person... |
2 | UsageDetail(machine=AutoPlate 9000, p... |
2 | UsageDetail(machine=sterilizer, perso... |
2 | UsageDetail(machine=AutoPlate 9000, p... |
2 | UsageDetail(machine=sterilizer, perso... |
3 | UsageDetail(machine=sterilizer, perso... |
3 | UsageDetail(machine=Inphormex, person... |
4 | UsageDetail(machine=sterilizer, perso... |
5 | UsageDetail(machine=AutoPlate 9000, p... |
5 | UsageDetail(machine=sterilizer, perso... |
6 | UsageDetail(machine=sterilizer, perso... |
7 | UsageDetail(machine=WY401, person=[Ma... |
7 | UsageDetail(machine=Inphormex, person... |
7 | UsageDetail(machine=AutoPlate 9000, p... |
8 | UsageDetail(machine=AutoPlate 9000, p... |
74. last element of array¶
dfUsage
.add("lastMachine") { Json.decodeFromString<List<UsageDetail>>(log).last().machine }
.select { ident and "lastMachine" }
DataFrame: rowsCount = 8, columnsCount = 2
ident | lastMachine |
---|---|
1 | Inphormex |
2 | sterilizer |
3 | Inphormex |
4 | sterilizer |
5 | sterilizer |
6 | sterilizer |
7 | AutoPlate 9000 |
8 | AutoPlate 9000 |
75. modify JSON¶
// in Kotlin we use immutable objects so we can't just add arbitrary fields
// a convoluted workaround is to deal with JsonObjects rather than actual defined classes (like MachineDetails above)
// even here, the JsonObject from kotlinx.serialization is immutable, so we must convert it to a MutableMap in order to add a new field
// a different JSON library (jackson, gson, moshi) is probably less strict about immutability
dfMachine
.convert { details }.with{ (Json.parseToJsonElement(details) as JsonObject).let { originalJson ->
JsonObject(originalJson.toMutableMap().apply {
this["sold"] = JsonPrimitive("2024-01-25")
})
}
}
// every value in the `details` column now has a "sold" key attached to it
DataFrame: rowsCount = 3, columnsCount = 3
ident | name | details |
---|---|---|
1 | WY401 | {"acquired":"2023-05-01","sold":"2024... |
2 | Inphormex | {"acquired":"2021-07-15","refurbished... |
3 | AutoPlate 9000 | {"note":"needs software update","sold... |
76. tombstones¶
// this concept is more suited to a persistent data store, to permanently mark certain data as "inactive"
// (but now the user needs to filter on "active" on every query going forward)
// much better (less error-prone) to create a view that filters out the inactive records
dfPenguins
.add("active") { species != "Adelie" } // prefer to use a Boolean rather than 0/1 (SQL bit)
.filter { "active"<Boolean>() }
.groupBy { species }
.count()
DataFrame: rowsCount = 2, columnsCount = 2
species | count |
---|---|
Chinstrap | 68 |
Gentoo | 124 |
77. views¶
VIEWs are a database construct -- a pre-saved query that is exposed like a table and is itself query-able
- dataframe (and virtually all database clients) treats views exactly the same as tables
- views can be as complex as any query, with joins, case/whens, calculations, etc
- views execute their query every time they are called, so they are potentially slower than tables
note: a materialized view already has the query results pre-calculated (like a cached query result) and is typically faster
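a minimal sketch: a database view is read exactly like a table (the view name below is hypothetical), and an in-memory stand-in for a view is just a function that re-runs its query every time it is called
// reading a view, assuming a view named "active_penguins" had been created in penguins.db
// DataFrame.readSqlTable(penguinsConn, "active_penguins")

// in-memory equivalent: the "view" re-applies its filter on every call
fun nonAdeliePenguins() = dfPenguins.filter { "species"<String>() != "Adelie" }
nonAdeliePenguins().count()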
78. add check¶
the dataframe API has no validation or constraints (other than type checking), but if a class uses Kotlin's `require`, that will act as validation inside the df
data class Job (
val name: String,
val billable: Double
) {
init { require(billable > 0.0) { "billable must be greater than 0" } }
}
// will fail validation and raise error
listOf<Job>(
Job("calibrate", 1.5),
Job("reset", -0.5)
)
billable must be greater than 0
java.lang.IllegalArgumentException: billable must be greater than 0
    at Line_188_jupyter$Job.<init>(Line_188.jupyter.kts:5)
    ... (remainder of the Jupyter stack trace omitted)
@DataSchema
data class ValidatingPenguin(
val species : String,
val island : String,
val bill_length_mm : Double,
val bill_depth_mm : Double,
val flipper_length_mm : Int,
val body_mass_g : Int,
val sex : String
) {
init { require(body_mass_g > 4000) { "body_mass must be greater than 4000" } }
}
// will fail validation and raise error
// eliminate the validation in the data class above and it will work fine
DataFrame
.readSqlTable(penguinsConn, "little_penguins")
.convertTo<ValidatingPenguin>()
.mapToColumn("validatingPenguin") { ValidatingPenguin(species,
island,
bill_length_mm,
bill_depth_mm,
flipper_length_mm,
body_mass_g,
sex
) }
body_mass must be greater than 4000
java.lang.IllegalArgumentException: body_mass must be greater than 4000
    at Line_190_jupyter$ValidatingPenguin.<init>(Line_190.jupyter.kts:11)
    ... (remainder of the Jupyter stack trace omitted)
transactions are another database construct, not directly applicable to dataframe
most db clients/ORMs support transactions
dataframes are not persistent data structures, so the concept of a rollback doesn't really apply
it is definitely worth learning how your db client handles transactions, rollbacks, updates, etc
// typical example use case for transactions is a bank transfer
// need to ensure that the debit and credit both occur, otherwise someone will be receiving free money (or sending money to a black hole)
// let's set up an example bank with customers
@DataSchema
data class Friend (
val name: String,
val bankBalance: Int
)
val friends : List<Friend> = listOf(Friend("Monica", 200), Friend("Ross", 600), Friend("Phoebe", 1000),
Friend("Joey", 100), Friend("Rachel", 800), Friend("Chandler", 750))
val dfFriends : DataFrame<Friend> = friends.toDataFrame()
// now let's create a transfer function
// we have two validation checks -- that both parties to the transfer are known customers, and that the sender has sufficient funds
// our function performs the validation before executing any of the changes, ensuring they will both be a success
// this is essentially the same as a transaction -- either everything succeeds or nothing changes
// of course, if there are unanticipated errors that aren't caught by the validation, that would cause a problem
// the same is true if these are lengthy operations and the data changes after the validation but before/during the updates
fun transferMoney(from: String, to: String, amount: Int) : DataFrame<*> {
if (from !in dfFriends.name || to !in dfFriends.name) throw Exception("unknown customer")
if (amount > dfFriends.single { name == from}.bankBalance) throw Exception("insufficient funds")
return dfFriends
.update { bankBalance }.where { name == from } .with { bankBalance - amount }
.update { bankBalance }.where { name == to } .with { bankBalance + amount }
}
transferMoney("Chandler", "Joey", 200) // works fine
DataFrame: rowsCount = 6, columnsCount = 2
name | bankBalance |
---|---|
Monica | 200 |
Ross | 600 |
Phoebe | 1000 |
Joey | 300 |
Rachel | 800 |
Chandler | 550 |
transferMoney("Chandler", "Brad", 200) // who's Brad?
unknown customer
java.lang.Exception: unknown customer
    at Line_197_jupyter.transferMoney(Line_197.jupyter.kts:12)
    ... (remainder of the Jupyter stack trace omitted)
transferMoney("Monica", "Chandler", 500) // too much
insufficient funds
java.lang.Exception: insufficient funds
    at Line_197_jupyter.transferMoney(Line_197.jupyter.kts:13)
    ... (remainder of the Jupyter stack trace omitted)
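for reference, this is what a real transaction looks like at the JDBC level -- a minimal sketch only, since the `bank.db` database, the `account` table, and its columns are hypothetical and not part of the sample data
import java.sql.DriverManager

// either both updates commit together, or the rollback undoes them both
val bankConn = DriverManager.getConnection("jdbc:sqlite:bank.db")
bankConn.autoCommit = false
try {
    bankConn.createStatement().use { st ->
        st.executeUpdate("UPDATE account SET balance = balance - 200 WHERE name = 'Chandler'")
        st.executeUpdate("UPDATE account SET balance = balance + 200 WHERE name = 'Joey'")
    }
    bankConn.commit()
} catch (e: Exception) {
    bankConn.rollback() // any failure undoes all changes since the last commit
    throw e
} finally {
    bankConn.autoCommit = true
}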
82. upsert¶
dataframe allows adding new rows (SQL INSERT) via append(), but an UPSERT is not directly supported
(UPSERT is not an official standard SQL keyword either, although many databases support it)
fun upsertFriends(friend : Friend) : DataFrame<*> {
return if (friend.name in dfFriends.name)
dfFriends
.update { bankBalance }.where { name == friend.name } .with { friend.bankBalance }
else
dfFriends.append(friend.name, friend.bankBalance)
}
upsertFriends(Friend("Rachel", 888)) // updates
DataFrame: rowsCount = 6, columnsCount = 2
name | bankBalance |
---|---|
Monica | 200 |
Ross | 600 |
Phoebe | 1000 |
Joey | 100 |
Rachel | 888 |
Chandler | 750 |
upsertFriends(Friend("Gunther", 125)) // inserts
DataFrame: rowsCount = 7, columnsCount = 2
name | bankBalance |
---|---|
Monica | 200 |
Ross | 600 |
Phoebe | 1000 |
Joey | 100 |
Rachel | 800 |
Chandler | 750 |
Gunther | 125 |
83. create trigger¶
// an equivalent to a database trigger would be a Kotlin function
// that performs a database operation and also some other procedure, potentially some side effect -- which can be tricky to trace/debug
@DataSchema
data class LabJob (
val person: String,
val reported: Double
) {
init { require(reported > 0.0) { "reported must be greater than 0" } }
}
@DataSchema
data class LabTotals (
val person: String, // dataframe has no unique constraint
val hours: Double
)
val dfLabJobs : DataFrame<LabJob> = listOf<LabJob>().toDataFrame<LabJob>()
val dfLabTotals : DataFrame<LabTotals> =
listOf<LabTotals>(LabTotals("gene", 0.0), LabTotals("august", 0.0)).toDataFrame<LabTotals>()
// theoretically, we can use something like this function to simulate a database trigger
// if we assign the result to a new variable (after chaining) then the totals would persist
// however it's far from perfect since the individual jobs (dfLabJobs) would not persist
fun <T> DataFrame<T>.addJob(job : LabJob) : DataFrame<T> {
if (! this.getColumn(person).contains(job.person) ) throw Exception("unknown person")
else {
dfLabJobs.concat( listOf<LabJob>(job).toDataFrame<LabJob>() ) // note: this result is discarded -- dataframes are immutable, so dfLabJobs itself never changes
return this.update { "hours"<Double>() }.where { "person"<String>() == job.person }.with { it + job.reported }
}
}
dfLabTotals
.addJob(LabJob("gene", 1.5))
.addJob(LabJob("august", 0.5))
.addJob(LabJob("gene", 1.0))
// .addJob(LabJob("fred", 4.0)) // will cause an error since "fred" is unknown
DataFrame: rowsCount = 2, columnsCount = 2
person | hours |
---|---|
gene | 2.500000 |
august | 0.500000 |
84. recursive query¶
// seeding the database just like the source `lineage` table
@DataSchema
data class Lineage (
val parent : String,
val child : String
)
val dfLineage = listOf<Lineage>(
Lineage("Arturo", "Clemente"),
Lineage("Darío", "Clemente"),
Lineage("Clemente", "Homero"),
Lineage("Clemente", "Ivonne"),
Lineage("Ivonne", "Lourdes"),
Lineage("Soledad", "Lourdes"),
Lineage("Lourdes", "Santiago")
).toDataFrame()
@DataSchema
data class Descendant (
val person : String,
val generation : Int = 0
)
dfLineage
DataFrame: rowsCount = 7, columnsCount = 2
parent | child |
---|---|
Arturo | Clemente |
Darío | Clemente |
Clemente | Homero |
Clemente | Ivonne |
Ivonne | Lourdes |
Soledad | Lourdes |
Lourdes | Santiago |
// we can use `fold` to emulate a recursive query.
// the number of generations cannot exceed the number of people so that is our range to iterate, seeding with an empty list
// for the zero generation, we find the people who are never listed as a child
// for future generations, we filter on the current generation and find the children of those individuals, and add them to the list
(0..dfLineage.count()).fold( listOf<Descendant>() ) { descendants, i ->
if (i == 0) {
dfLineage.filter { parent !in dfLineage.child }.map { Descendant(it.parent, i) }
} else {
val parents : List<String> = descendants.filter { it.generation == i - 1 }.map { it.person }
val children = dfLineage.filter { parent in parents } //.getColumn{ child }.toList()
descendants + children.distinct { child }.map { Descendant(it.child, i) }
}
}.toDataFrame<Descendant>()
.groupBy { person }.minOf { generation }
DataFrame: rowsCount = 8, columnsCount = 2
person | min |
---|---|
Arturo | 0 |
Darío | 0 |
Soledad | 0 |
Clemente | 1 |
Lourdes | 1 |
Homero | 2 |
Ivonne | 2 |
Santiago | 2 |
Run the above with and without the `groupBy` at the end... each child has 2 parents, so they can potentially be part of different generations.
For example, Lourdes is the child of Ivonne and Soledad. Soledad is zero-generation (no parent listed), but Ivonne is a child of Clemente, who is a child of both Arturo and Darío. Therefore Lourdes is Gen-1 and also Gen-3. The final `groupBy` keeps only the minimum generation for each person, but that is an arbitrary decision.
85. bidirectional contacts¶
val contactTracingConn : Connection = DriverManager.getConnection("jdbc:sqlite:contact_tracing.db")
val dfPerson = DataFrame.readSqlTable(contactTracingConn, "person")
val dfContact = DataFrame.readSqlTable(contactTracingConn, "contact")
.rename("left").into("leftCol")
.rename("right").into("rightCol")
dfContact
DataFrame: rowsCount = 8, columnsCount = 2
leftCol | rightCol |
---|---|
Agustín Rodríquez | Ariadna Caraballo |
Agustín Rodríquez | Verónica Altamirano |
Juana Baeza | Verónica Altamirano |
Juana Baeza | Micaela Laboy |
Pilar Alarcón | Reina Rivero |
Cristal Soliz | Marco Antonio Barrera |
Cristal Soliz | Daniela Menéndez |
Daniela Menéndez | Marco Antonio Barrera |
// dataframe matches up by column name, not simply position
// even if we switch the order of the columns, when we .concat() the dataframe re-orders by column name
// here, the first 8 rows are the same as the bottom 8 rows even though we switched the order of the columns
// (when performing a UNION in SQL, the order of the select takes priority over matching column names)
dfContact
.concat(dfContact.move { rightCol }.toLeft())
DataFrame: rowsCount = 16, columnsCount = 2
leftCol | rightCol |
---|---|
Agustín Rodríquez | Ariadna Caraballo |
Agustín Rodríquez | Verónica Altamirano |
Juana Baeza | Verónica Altamirano |
Juana Baeza | Micaela Laboy |
Pilar Alarcón | Reina Rivero |
Cristal Soliz | Marco Antonio Barrera |
Cristal Soliz | Daniela Menéndez |
Daniela Menéndez | Marco Antonio Barrera |
Agustín Rodríquez | Ariadna Caraballo |
Agustín Rodríquez | Verónica Altamirano |
Juana Baeza | Verónica Altamirano |
Juana Baeza | Micaela Laboy |
Pilar Alarcón | Reina Rivero |
Cristal Soliz | Marco Antonio Barrera |
Cristal Soliz | Daniela Menéndez |
Daniela Menéndez | Marco Antonio Barrera |
// same thing with this syntax
dfContact.select { cols(0,1) }
.concat(dfContact.select { cols(1,0) })
DataFrame: rowsCount = 16, columnsCount = 2
leftCol | rightCol |
---|---|
Agustín Rodríquez | Ariadna Caraballo |
Agustín Rodríquez | Verónica Altamirano |
Juana Baeza | Verónica Altamirano |
Juana Baeza | Micaela Laboy |
Pilar Alarcón | Reina Rivero |
Cristal Soliz | Marco Antonio Barrera |
Cristal Soliz | Daniela Menéndez |
Daniela Menéndez | Marco Antonio Barrera |
Agustín Rodríquez | Ariadna Caraballo |
Agustín Rodríquez | Verónica Altamirano |
Juana Baeza | Verónica Altamirano |
Juana Baeza | Micaela Laboy |
Pilar Alarcón | Reina Rivero |
Cristal Soliz | Marco Antonio Barrera |
Cristal Soliz | Daniela Menéndez |
Daniela Menéndez | Marco Antonio Barrera |
// we have to rename the columns or rebuild a df with matching names in order to successfully concat()
// this is a bit convoluted but also a pretty unusual operation
val dfBiContact = listOf(
dfContact.getColumn(1).named("leftCol"),
dfContact.getColumn(0).named("rightCol")
).toDataFrame()
.concat(dfContact)
dfBiContact
DataFrame: rowsCount = 16, columnsCount = 2
leftCol | rightCol |
---|---|
Ariadna Caraballo | Agustín Rodríquez |
Verónica Altamirano | Agustín Rodríquez |
Verónica Altamirano | Juana Baeza |
Micaela Laboy | Juana Baeza |
Reina Rivero | Pilar Alarcón |
Marco Antonio Barrera | Cristal Soliz |
Daniela Menéndez | Cristal Soliz |
Marco Antonio Barrera | Daniela Menéndez |
Agustín Rodríquez | Ariadna Caraballo |
Agustín Rodríquez | Verónica Altamirano |
Juana Baeza | Verónica Altamirano |
Juana Baeza | Micaela Laboy |
Pilar Alarcón | Reina Rivero |
Cristal Soliz | Marco Antonio Barrera |
Cristal Soliz | Daniela Menéndez |
Daniela Menéndez | Marco Antonio Barrera |
86. update group identifiers¶
dfPerson
.innerJoinWith(dfBiContact) { it.name == right.leftCol }
.innerJoinWith(dfPerson) { "rightCol"<String>() == right.name }
.select { "leftCol" and "ident" and "rightCol" and "ident1" }
.add("new_ident") { minOf("ident"<Int>(), "ident1"<Int>()) }
DataFrame: rowsCount = 16, columnsCount = 5
leftCol | ident | rightCol | ident1 | new_ident |
---|---|---|---|---|
Juana Baeza | 1 | Verónica Altamirano | 5 | 1 |
Juana Baeza | 1 | Micaela Laboy | 4 | 1 |
Agustín Rodríquez | 2 | Ariadna Caraballo | 3 | 2 |
Agustín Rodríquez | 2 | Verónica Altamirano | 5 | 2 |
Ariadna Caraballo | 3 | Agustín Rodríquez | 2 | 2 |
Micaela Laboy | 4 | Juana Baeza | 1 | 1 |
Verónica Altamirano | 5 | Agustín Rodríquez | 2 | 2 |
Verónica Altamirano | 5 | Juana Baeza | 1 | 1 |
Reina Rivero | 6 | Pilar Alarcón | 10 | 6 |
Pilar Alarcón | 10 | Reina Rivero | 6 | 6 |
Daniela Menéndez | 11 | Cristal Soliz | 13 | 11 |
Daniela Menéndez | 11 | Marco Antonio Barrera | 12 | 11 |
Marco Antonio Barrera | 12 | Cristal Soliz | 13 | 12 |
Marco Antonio Barrera | 12 | Daniela Menéndez | 11 | 11 |
Cristal Soliz | 13 | Marco Antonio Barrera | 12 | 12 |
Cristal Soliz | 13 | Daniela Menéndez | 11 | 11 |
87. recursive labeling¶
sorry, but this query makes little sense to me... I'm trying to get this completed in a limited amount of time, so you're on your own for this one
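that said, here is one rough sketch (my own approach, not from the source): propagate the minimum ident through the bidirectional contact list until nothing changes, which gives every connected group of contacts a single label
// start each person with their own ident, then keep adopting the smaller label of any contact
val labels = dfPerson.rows()
    .associate { it["name"] as String to (it["ident"] as Number).toInt() }
    .toMutableMap()
var changed = true
while (changed) {
    changed = false
    for (row in dfBiContact.rows()) {
        val a = row["leftCol"] as String
        val b = row["rightCol"] as String
        val m = minOf(labels.getValue(a), labels.getValue(b))
        if (labels[a] != m || labels[b] != m) {
            labels[a] = m
            labels[b] = m
            changed = true
        }
    }
}
labels // each person mapped to the smallest ident in their contact group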
88. query from Python ORM¶
// we're not going to use Python however we can use this opportunity to demonstrate using an ORM
// these examples use ktorm (https://www.ktorm.org/), but there are many other options: jOOQ, komapper, Exposed, SQLDelight, kotliquery...
import org.ktorm.database.Database
import org.ktorm.database.use
import org.ktorm.entity.Entity
import org.ktorm.entity.sequenceOf
import org.ktorm.schema.*
import org.ktorm.dsl.*
// ktorm requires us to build an Entity
interface Penguin : Entity<Penguin> {
val species : String
val island : String
val bill_length_mm : Double
val bill_depth_mm : Double
val flipper_length_mm : Int
val body_mass_g : Int
val sex : String
}
// and also a mapper from database columns to the Entity
object Penguins : Table<Penguin>("penguins") {
val species = varchar("species").bindTo { it.species }
val island = varchar("island").bindTo { it.island }
val bill_length_mm = double("bill_length_mm").bindTo { it.bill_length_mm }
val bill_depth_mm = double("bill_depth_mm").bindTo { it.bill_depth_mm }
val flipper_length_mm = int("flipper_length_mm").bindTo { it.flipper_length_mm }
val body_mass_g = int("body_mass_g").bindTo { it.body_mass_g }
val sex = varchar("sex").bindTo { it.sex }
}
import org.ktorm.support.sqlite.SQLiteDialect
val dbPenguins = Database.connect(
url = "jdbc:sqlite:penguins.db",
dialect = SQLiteDialect()
)
// the benefit of defining the Entity is now we can write type-safe SQL using ktorm's DSL,
// and auto-map the query result to our Penguin class
dbPenguins
.from(Penguins)
.select()
.where { Penguins.sex.eq("MALE") and Penguins.body_mass_g.gt(5000) }
.orderBy(Penguins.flipper_length_mm.desc())
.map { row ->
Penguins.createEntity(row)
}.toDataFrame()
... showing only top 20 of 56 rows
DataFrame: rowsCount = 56, columnsCount = 9
bill_depth_mm | bill_length_mm | body_mass_g | flipper_length_mm | island | sex | species | entityClass | properties |
---|---|---|---|---|---|---|---|---|
15.700000 | 54.300000 | 5650 | 231 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
16.300000 | 50.000000 | 5700 | 230 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
17.000000 | 59.600000 | 6050 | 230 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
16.800000 | 49.800000 | 5700 | 230 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
16.000000 | 48.600000 | 5800 | 230 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
17.000000 | 52.100000 | 5550 | 230 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
16.300000 | 51.500000 | 5500 | 230 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
16.000000 | 55.100000 | 5850 | 230 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
16.200000 | 49.500000 | 5800 | 229 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
15.900000 | 49.800000 | 5950 | 229 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
17.300000 | 50.800000 | 5600 | 228 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
17.100000 | 52.200000 | 5400 | 228 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
17.000000 | 55.900000 | 5600 | 228 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
15.000000 | 49.100000 | 5500 | 228 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
15.700000 | 50.800000 | 5200 | 226 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
16.000000 | 49.600000 | 5700 | 225 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
15.900000 | 50.500000 | 5400 | 225 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
16.500000 | 51.100000 | 5250 | 225 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
15.300000 | 50.400000 | 5550 | 224 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
15.900000 | 50.000000 | 5350 | 224 | Biscoe | MALE | Gentoo | class Line_226_jupyter$Penguin | {species=Gentoo, island=Biscoe, bill_... |
89. DataFrame accessors¶
in the source material, queries 89 to 100 demonstrate the use of Python, pandas, polars and ORMs, which aren't relevant to dataframe
so let's use these last 12 slots to break from the source and demonstrate some cool dataframe features
some have equivalents in SQL but others, like hierarchical data, do not
DataFrame has 3 APIs to reference columns. Within a Jupyter notebook, schemas are rebuilt upon every executed cell, so, generally, it is easier to refer to columns directly, even without a defined schema. Gradle projects will be stricter, as there is no "intermediate compilation" like Jupyter, so every schema must be explicitly defined or referenced.
// String API - the compiler does not know anything about the column, so we must specify the data type
// the only time this is necessary in Jupyter is when we create new columns on the fly and then reference them in the same cell
dfPenguins
.filter { "bill_length_mm"<Double>() != null && "bill_depth_mm"<Double>() != null }
.add("bill_area_mm2") { "bill_length_mm"<Double>() * "bill_depth_mm"<Double>() }
.mean("bill_area_mm2"<Double>())
// https://kotlin.github.io/dataframe/stringapi.html
750.7856432748542
// Column Accessors - we define the column once and then can reference it
// in some contexts we need to use `invoke()` (shortened to just `()`)
val bill_length_mm by column<Double>()
val bill_depth_mm by column<Double>()
val bill_area_mm2 by column<Double>()
dfPenguins
.filter { bill_length_mm != null && bill_depth_mm != null }
.add( bill_area_mm2 ) { bill_length_mm() * bill_depth_mm() }
.mean( bill_area_mm2 )
// https://kotlin.github.io/dataframe/columnaccessorsapi.html
750.7856432748542
// note this data class is not annotated with @DataSchema
// not sure when this is preferable to annotating and having the compiler ensure the references are type-safe
data class PenguinClass(
val species : String,
val island : String,
val bill_length_mm : Double,
val bill_depth_mm : Double,
val flipper_length_mm : Int?,
val body_mass_g : Int?,
val sex : String,
val bill_area : Double
)
dfPenguins
.dropNulls(PenguinClass::bill_length_mm)
.dropNulls(PenguinClass::bill_depth_mm)
.add( PenguinClass::bill_area ) { it.get(PenguinClass::bill_length_mm) * it.get(PenguinClass::bill_depth_mm) }
.mean( PenguinClass::bill_area )
// https://kotlin.github.io/dataframe/kpropertiesapi.html
750.7856432748542
90. importing a hierarchical data set¶
for the next few examples, we'll operate on a data set of Tom Brady's career statistics
the source is pro-football-reference.com: https://www.pro-football-reference.com/players/B/BradTo00/gamelog/
we've taken the regular season gamelog and saved it as a CSV
first thing to note in the source: the stats columns are grouped by category: Passing, Rushing, Receiving, etc.
we want our data organized the same way. fortunately, dataframe supports hierarchical data, although it cannot natively read it from a CSV
therefore, we're going to have to generate column names that are hierarchical in nature
// we're going to use a separate CSV reader to do our first pass
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader
val csvFile : File = File("E:\\Kotlin\\HundredSQLQueries\\src\\main\\TomBrady.csv")
val csvRaw : List<List<String>> = csvReader().readAll(csvFile)
// the headers are on the first 2 lines, so let's combine the first and second row into a single string
val hierarchalHeaders : List<String> = (csvRaw[0] zip csvRaw[1]).map { "${it.first};;${it.second}" }
val dfBradyRaw = DataFrame.readCSV(
file = csvFile,
skipLines = 2,
header = hierarchalHeaders
).filter { ";;GS"<String>() == null || ";;GS"<String>() == "*" } // GS = game started - we eliminate any row where Brady was "inactive", "suspended", etc.
.parse() // the presence of "Inactive", etc. forced some numeric columns to be read as Strings; now that those rows are gone, parse() restores the correct data types
dfBradyRaw // this is a flat table; the next step is to reshape it into hierarchical form
... showing only top 20 of 335 rows
DataFrame: rowsCount = 335, columnsCount = 65
;;Rk | ;;Year | ;;Date | ;;G# | ;;Week | ;;Age | ;;Tm | ;; | ;;Opp | ;;Result | ;;GS | Passing;;Cmp | Passing;;Att | Passing;;Cmp% | Passing;;Yds | Passing;;TD | Passing;;Int | Passing;;Rate | Passing;;Sk | Passing;;Yds1 | Passing;;Y/A | Passing;;AY/A | Rushing;;Att | Rushing;;Yds | Rushing;;Y/A | Rushing;;TD | Receiving;;Tgt | Receiving;;Rec | Receiving;;Yds | Receiving;;Y/R | Receiving;;TD | Receiving;;Ctch% | Receiving;;Y/Tgt | Scoring;;TD | Scoring;;Pts | ;;Sk | Tackles;;Solo | Tackles;;Ast | Tackles;;Comb | Tackles;;TFL | Tackles;;QBHits | Fumbles;;Fmb | Fumbles;;FL | Fumbles;;FF | Fumbles;;FR | Fumbles;;Yds | Fumbles;;TD | Punting;;Pnt | Punting;;Yds | Punting;;Y/P | Punting;;RetYds | Punting;;Net | Punting;;NY/P | Punting;;TB | Punting;;TB% | Punting;;In20 | Punting;;In20% | Punting;;Blck | Off. Snaps;;Num | Off. Snaps;;Pct | Def. Snaps;;Num | Def. Snaps;;Pct | ST Snaps;;Num | ST Snaps;;Pct | ;;Status |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | 2000 | 2000-11-23 | 12 | 13 | 23.112000 | NWE | @ | DET | L 9-34 | null | 1 | 3 | 33.330000 | 6 | 0 | 0 | 42.400000 | 0 | 0 | 2.000000 | 2.000000 | 0 | 0 | null | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
18 | 2001 | 2001-09-23 | 2 | 2 | 24.051000 | NWE | null | NYJ | L 3-10 | null | 5 | 10 | 50.000000 | 46 | 0 | 0 | 62.900000 | 0 | 0 | 4.600000 | 4.600000 | 1 | 9 | 9.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
19 | 2001 | 2001-09-30 | 3 | 3 | 24.058000 | NWE | null | IND | W 44-13 | * | 13 | 23 | 56.520000 | 168 | 0 | 0 | 79.600000 | 1 | 9 | 7.300000 | 7.300000 | 1 | 2 | 2.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
20 | 2001 | 2001-10-07 | 4 | 4 | 24.065000 | NWE | @ | MIA | L 10-30 | * | 12 | 24 | 50.000000 | 86 | 0 | 0 | 58.700000 | 4 | 17 | 3.580000 | 3.580000 | 2 | 9 | 4.500000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 2 | 0 | 0 | 0 | -14 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
21 | 2001 | 2001-10-14 | 5 | 5 | 24.072000 | NWE | null | SDG | W 29-26 | * | 33 | 54 | 61.110000 | 364 | 2 | 0 | 93.400000 | 3 | 19 | 6.740000 | 7.480000 | 1 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 2 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
22 | 2001 | 2001-10-21 | 6 | 6 | 24.079000 | NWE | @ | IND | W 38-17 | * | 16 | 20 | 80.000000 | 202 | 3 | 0 | 148.300000 | 0 | 0 | 10.100000 | 13.100000 | 2 | -2 | -1.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
23 | 2001 | 2001-10-28 | 7 | 7 | 24.086000 | NWE | @ | DEN | L 20-31 | * | 25 | 38 | 65.790000 | 203 | 2 | 4 | 57.100000 | 2 | 20 | 5.340000 | 1.660000 | 2 | -1 | -0.500000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 1 | 0 | 0 | 1 | -3 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
24 | 2001 | 2001-11-04 | 8 | 8 | 24.093000 | NWE | @ | ATL | W 24-10 | * | 21 | 31 | 67.740000 | 250 | 3 | 0 | 124.400000 | 3 | 14 | 8.060000 | 10.000000 | 3 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
25 | 2001 | 2001-11-11 | 9 | 9 | 24.100000 | NWE | null | BUF | W 21-11 | * | 15 | 21 | 71.430000 | 107 | 1 | 1 | 78.900000 | 7 | 36 | 5.100000 | 3.900000 | 4 | -1 | -0.250000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
26 | 2001 | 2001-11-18 | 10 | 10 | 24.107000 | NWE | null | STL | L 17-24 | * | 19 | 27 | 70.370000 | 185 | 1 | 2 | 70.800000 | 2 | 6 | 6.850000 | 4.260000 | 2 | 6 | 3.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
27 | 2001 | 2001-11-25 | 11 | 11 | 24.114000 | NWE | null | NOR | W 34-17 | * | 19 | 26 | 73.080000 | 258 | 4 | 0 | 143.900000 | 4 | 17 | 9.920000 | 13.000000 | 4 | 14 | 3.500000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
28 | 2001 | 2001-12-02 | 12 | 12 | 24.121000 | NWE | @ | NYJ | W 17-16 | * | 20 | 28 | 71.430000 | 213 | 0 | 0 | 93.300000 | 3 | 22 | 7.610000 | 7.610000 | 3 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
29 | 2001 | 2001-12-09 | 13 | 13 | 24.128000 | NWE | null | CLE | W 27-16 | * | 19 | 28 | 67.860000 | 218 | 0 | 2 | 61.300000 | 3 | 9 | 7.790000 | 4.570000 | 4 | -3 | -0.750000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
30 | 2001 | 2001-12-16 | 14 | 14 | 24.135000 | NWE | @ | BUF | W 12-9 | * | 19 | 35 | 54.290000 | 237 | 0 | 1 | 63.600000 | 5 | 31 | 6.770000 | 5.490000 | 3 | 13 | 4.330000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
31 | 2001 | 2001-12-22 | 15 | 15 | 24.141000 | NWE | null | MIA | W 20-13 | * | 11 | 19 | 57.890000 | 108 | 1 | 0 | 91.600000 | 3 | 14 | 5.680000 | 6.740000 | 3 | -5 | -1.670000 | 0 | 1 | 1 | 23 | 23.000000 | 0 | 100.0% | 23.000000 | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
32 | 2001 | 2002-01-06 | 16 | 17 | 24.156000 | NWE | @ | CAR | W 38-6 | * | 17 | 29 | 58.620000 | 198 | 1 | 2 | 62.100000 | 1 | 2 | 6.830000 | 4.410000 | 1 | 2 | 2.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
33 | 2002 | 2002-09-09 | 1 | 1 | 25.037000 | NWE | null | PIT | W 30-14 | * | 29 | 43 | 67.440000 | 294 | 3 | 0 | 110.000000 | 2 | 14 | 6.840000 | 8.230000 | 0 | 0 | null | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
34 | 2002 | 2002-09-15 | 2 | 2 | 25.043000 | NWE | @ | NYJ | W 44-7 | * | 25 | 35 | 71.430000 | 269 | 2 | 1 | 100.800000 | 0 | 0 | 7.690000 | 7.540000 | 2 | 5 | 2.500000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
35 | 2002 | 2002-09-22 | 3 | 3 | 25.050000 | NWE | null | KAN | W 41-38 | * | 39 | 54 | 72.220000 | 410 | 4 | 1 | 110.900000 | 4 | 11 | 7.590000 | 8.240000 | 1 | 2 | 2.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | 0 | 0 | 0 | 0 | null | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
36 | 2002 | 2002-09-29 | 4 | 4 | 25.057000 | NWE | @ | SDG | L 14-21 | * | 36 | 53 | 67.920000 | 353 | 2 | 2 | 83.300000 | 0 | 0 | 6.660000 | 5.720000 | 1 | 3 | 3.000000 | 0 | 0 | 0 | 0 | null | 0 | 0.0% | null | 0 | 0 | 0.000000 | null | null | null | null | null | 0 | null | 0 | 0 | 0 | 0 | 0 | 0 | null | 0 | 0 | null | 0 | null | 0 | null | 0 | null | null | null | null | null | null | null |
// grouping columns together creates a hierarchical data set
// we use the header strings we created in the first step to easily identify the columns to be grouped
val dfBradyHierarchal = dfBradyRaw
.group { cols { it.name.startsWith(";;") } }.into { "Meta" }
.group { cols { it.name.startsWith("Passing;;") } }.into { "Passing" }
.group { cols { it.name.startsWith("Rushing;;") } }.into { "Rushing" }
.group { cols { it.name.startsWith("Receiving;;") } }.into { "Receiving" }
.group { cols { it.name.startsWith("Scoring;;") } }.into { "Scoring" }
.remove { cols { !it.isColumnGroup() } } // we are going to ignore defense, kicking, punting, etc -- so anything that isn't in a group yet will be removed
.rename { all().recursively() }.into { it.name.replace(";;", "") }
dfBradyHierarchal
... showing only top 20 of 335 rows
DataFrame: rowsCount = 335, columnsCount = 5
Meta | Passing | Rushing | Receiving | Scoring |
---|---|---|---|---|
{ Rk:12, Year:2000, Date:2000-11-23, ... | { PassingCmp:1, PassingAtt:3, Passing... | { RushingAtt:0, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:18, Year:2001, Date:2001-09-23, ... | { PassingCmp:5, PassingAtt:10, Passin... | { RushingAtt:1, RushingYds:9, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:19, Year:2001, Date:2001-09-30, ... | { PassingCmp:13, PassingAtt:23, Passi... | { RushingAtt:1, RushingYds:2, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:20, Year:2001, Date:2001-10-07, ... | { PassingCmp:12, PassingAtt:24, Passi... | { RushingAtt:2, RushingYds:9, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:21, Year:2001, Date:2001-10-14, ... | { PassingCmp:33, PassingAtt:54, Passi... | { RushingAtt:1, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:22, Year:2001, Date:2001-10-21, ... | { PassingCmp:16, PassingAtt:20, Passi... | { RushingAtt:2, RushingYds:-2, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:23, Year:2001, Date:2001-10-28, ... | { PassingCmp:25, PassingAtt:38, Passi... | { RushingAtt:2, RushingYds:-1, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:24, Year:2001, Date:2001-11-04, ... | { PassingCmp:21, PassingAtt:31, Passi... | { RushingAtt:3, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:25, Year:2001, Date:2001-11-11, ... | { PassingCmp:15, PassingAtt:21, Passi... | { RushingAtt:4, RushingYds:-1, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:26, Year:2001, Date:2001-11-18, ... | { PassingCmp:19, PassingAtt:27, Passi... | { RushingAtt:2, RushingYds:6, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:27, Year:2001, Date:2001-11-25, ... | { PassingCmp:19, PassingAtt:26, Passi... | { RushingAtt:4, RushingYds:14, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:28, Year:2001, Date:2001-12-02, ... | { PassingCmp:20, PassingAtt:28, Passi... | { RushingAtt:3, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:29, Year:2001, Date:2001-12-09, ... | { PassingCmp:19, PassingAtt:28, Passi... | { RushingAtt:4, RushingYds:-3, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:30, Year:2001, Date:2001-12-16, ... | { PassingCmp:19, PassingAtt:35, Passi... | { RushingAtt:3, RushingYds:13, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:31, Year:2001, Date:2001-12-22, ... | { PassingCmp:11, PassingAtt:19, Passi... | { RushingAtt:3, RushingYds:-5, Rushin... | { ReceivingTgt:1, ReceivingRec:1, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:32, Year:2001, Date:2002-01-06, ... | { PassingCmp:17, PassingAtt:29, Passi... | { RushingAtt:1, RushingYds:2, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:33, Year:2002, Date:2002-09-09, ... | { PassingCmp:29, PassingAtt:43, Passi... | { RushingAtt:0, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:34, Year:2002, Date:2002-09-15, ... | { PassingCmp:25, PassingAtt:35, Passi... | { RushingAtt:2, RushingYds:5, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:35, Year:2002, Date:2002-09-22, ... | { PassingCmp:39, PassingAtt:54, Passi... | { RushingAtt:1, RushingYds:2, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:36, Year:2002, Date:2002-09-29, ... | { PassingCmp:36, PassingAtt:53, Passi... | { RushingAtt:1, RushingYds:3, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
// we now have a good hierarchical data set, however there are a few items we'd like to clean up
val dfBrady = dfBradyHierarchal
.convert { "Meta"["untitled"] named "location" }.with { if (it == null) "home" else "away" }
.convert { "Meta"["GS"] named "started" }.with { it == null } // started? true/false
.add { // break up "Result" into 3 fields -- win/loss, team score and opponent score
"WinLoss" from { "Meta"["Result"]<String>().let { if (it.first() == 'W') "win" else if (it.first() == 'L') "loss" else "tie" } }
"OurScore" from { "Meta"["Result"]<String>().let { it.drop(1).substringBefore("-").trim().toInt() } }
"TheirScore" from { "Meta"["Result"]<String>().let { it.substringAfter("-").trim().toInt() } }
}.move { "WinLoss" and "OurScore" and "TheirScore" }.under("Meta")
.parse()
dfBrady
... showing only top 20 of 335 rows
DataFrame: rowsCount = 335, columnsCount = 5
Meta | Passing | Rushing | Receiving | Scoring |
---|---|---|---|---|
{ Rk:12, Year:2000, Date:2000-11-23, ... | { PassingCmp:1, PassingAtt:3, Passing... | { RushingAtt:0, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:18, Year:2001, Date:2001-09-23, ... | { PassingCmp:5, PassingAtt:10, Passin... | { RushingAtt:1, RushingYds:9, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:19, Year:2001, Date:2001-09-30, ... | { PassingCmp:13, PassingAtt:23, Passi... | { RushingAtt:1, RushingYds:2, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:20, Year:2001, Date:2001-10-07, ... | { PassingCmp:12, PassingAtt:24, Passi... | { RushingAtt:2, RushingYds:9, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:21, Year:2001, Date:2001-10-14, ... | { PassingCmp:33, PassingAtt:54, Passi... | { RushingAtt:1, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:22, Year:2001, Date:2001-10-21, ... | { PassingCmp:16, PassingAtt:20, Passi... | { RushingAtt:2, RushingYds:-2, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:23, Year:2001, Date:2001-10-28, ... | { PassingCmp:25, PassingAtt:38, Passi... | { RushingAtt:2, RushingYds:-1, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:24, Year:2001, Date:2001-11-04, ... | { PassingCmp:21, PassingAtt:31, Passi... | { RushingAtt:3, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:25, Year:2001, Date:2001-11-11, ... | { PassingCmp:15, PassingAtt:21, Passi... | { RushingAtt:4, RushingYds:-1, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:26, Year:2001, Date:2001-11-18, ... | { PassingCmp:19, PassingAtt:27, Passi... | { RushingAtt:2, RushingYds:6, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:27, Year:2001, Date:2001-11-25, ... | { PassingCmp:19, PassingAtt:26, Passi... | { RushingAtt:4, RushingYds:14, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:28, Year:2001, Date:2001-12-02, ... | { PassingCmp:20, PassingAtt:28, Passi... | { RushingAtt:3, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:29, Year:2001, Date:2001-12-09, ... | { PassingCmp:19, PassingAtt:28, Passi... | { RushingAtt:4, RushingYds:-3, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:30, Year:2001, Date:2001-12-16, ... | { PassingCmp:19, PassingAtt:35, Passi... | { RushingAtt:3, RushingYds:13, Rushin... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:31, Year:2001, Date:2001-12-22, ... | { PassingCmp:11, PassingAtt:19, Passi... | { RushingAtt:3, RushingYds:-5, Rushin... | { ReceivingTgt:1, ReceivingRec:1, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:32, Year:2001, Date:2002-01-06, ... | { PassingCmp:17, PassingAtt:29, Passi... | { RushingAtt:1, RushingYds:2, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:33, Year:2002, Date:2002-09-09, ... | { PassingCmp:29, PassingAtt:43, Passi... | { RushingAtt:0, RushingYds:0, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:34, Year:2002, Date:2002-09-15, ... | { PassingCmp:25, PassingAtt:35, Passi... | { RushingAtt:2, RushingYds:5, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:35, Year:2002, Date:2002-09-22, ... | { PassingCmp:39, PassingAtt:54, Passi... | { RushingAtt:1, RushingYds:2, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
{ Rk:36, Year:2002, Date:2002-09-29, ... | { PassingCmp:36, PassingAtt:53, Passi... | { RushingAtt:1, RushingYds:3, Rushing... | { ReceivingTgt:0, ReceivingRec:0, Rec... | { ScoringTD:0, ScoringPts:0 } |
91. pivoting¶
// let's get a summary of Brady's win/loss record against every team
// pivot will create a new column from every distinct value in "WinLoss" column (there are only 2)
// typically needs 3 inputs or dimensions -- the field to pivot (horizontal), the group by (vertical)
// and usually an aggregation calculation for the inner cells
// some RDBMSs support PIVOT, but it is usually less flexible --
// for example, SQL Server's PIVOT requires the user to specify the name of every horizontal column
// easy here (just "win" and "loss") but sometimes all the possible values are not known in advance
dfBrady
.pivot(inward=false) { Meta.WinLoss }
.groupBy { Meta.Opp named "opponent" }
.count()
.sortBy { "opponent"<String>() }
... showing only top 20 of 35 rows
DataFrame: rowsCount = 35, columnsCount = 3
opponent | loss | win |
---|---|---|
ARI | 1 | 2 |
ATL | 1 | 10 |
BAL | 3 | 6 |
BUF | 3 | 33 |
CAR | 4 | 7 |
CHI | 1 | 6 |
CIN | 2 | 7 |
CLE | 2 | 7 |
DAL | 0 | 7 |
DEN | 6 | 8 |
DET | 2 | 5 |
GNB | 3 | 4 |
HOU | 2 | 7 |
IND | 3 | 12 |
JAX | 1 | 5 |
KAN | 6 | 5 |
LAC | 0 | 2 |
LAR | 2 | 2 |
LVR | 0 | 1 |
MIA | 12 | 24 |
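A small aside (my assumption, not from the original document): when the cell aggregation is just a count, Kotlin DataFrame also offers pivotCounts, which should collapse the pivot + groupBy + count chain above into a single call.

// hedged sketch -- pivotCounts combines pivot + count; by default the counts may land
// in a nested column group, whereas inward = false (as used above) keeps them top-level
dfBrady
    .groupBy { Meta.Opp named "opponent" }
    .pivotCounts { Meta.WinLoss }
    .sortBy { "opponent"<String>() }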
92. pivoting with more detail¶
// ok, so we know Brady's record against every team
// let's see how it breaks down by yardage thresholds -- what does his record look like when he throws for 300+ or 400+ yards, vs. other games?
val yardsThresholds : List<Int> = listOf(0, 200, 300, 350, 400, 450, 500)
dfBrady
.add("YardsThreshold") { yardsThresholds.filter { Passing.PassingYds > it } }//.under("Passing")
.explode { "YardsThreshold"<List<Int>>() }
.pivot(inward=false) { Meta.WinLoss }
.groupBy { Meta.Opp and "YardsThreshold"<List<Int>>() }
.count()
.sortBy { "Opp" and "YardsThreshold" }
.pivot { "YardsThreshold"<Int>() }
.groupBy { "Opp"<String>() }
.values { "loss" and "win" }
... showing only top 20 of 35 rows
DataFrame: rowsCount = 35, columnsCount = 2
Opp | YardsThreshold |
---|---|
ARI | { 0:{ loss:1, win:2 }, 200:{ loss:1, ... |
ATL | { 0:{ loss:1, win:10 }, 200:{ loss:0,... |
BAL | { 0:{ loss:3, win:6 }, 200:{ loss:3, ... |
BUF | { 0:{ loss:3, win:33 }, 200:{ loss:1,... |
CAR | { 0:{ loss:4, win:7 }, 200:{ loss:4, ... |
CHI | { 0:{ loss:1, win:6 }, 200:{ loss:1, ... |
CIN | { 0:{ loss:2, win:7 }, 200:{ loss:1, ... |
CLE | { 0:{ loss:2, win:7 }, 200:{ loss:2, ... |
DAL | { 0:{ loss:0, win:7 }, 200:{ loss:0, ... |
DEN | { 0:{ loss:6, win:8 }, 200:{ loss:5, ... |
DET | { 0:{ loss:2, win:5 }, 200:{ loss:0, ... |
GNB | { 0:{ loss:3, win:4 }, 200:{ loss:2, ... |
HOU | { 0:{ loss:2, win:7 }, 200:{ loss:1, ... |
IND | { 0:{ loss:3, win:12 }, 200:{ loss:3,... |
JAX | { 0:{ loss:1, win:5 }, 200:{ loss:1, ... |
KAN | { 0:{ loss:6, win:5 }, 200:{ loss:4, ... |
LAC | { 0:{ loss:0, win:2 }, 200:{ loss:0, ... |
LAR | { 0:{ loss:2, win:2 }, 200:{ loss:2, ... |
LVR | { 0:{ loss:0, win:1 }, 200:{ loss:0, ... |
MIA | { 0:{ loss:12, win:24 }, 200:{ loss:7... |
93. using a class's native properties/methods¶
// let's define a class with some member calculations
data class WinLossRecord(
val wins: Int,
val losses: Int
) {
val winPct: Double get() = wins.toDouble().div(wins.plus(losses))
val undefeated: Boolean get() = wins > 0 && losses == 0
}
// start with the same query as #90,
// but we will add a column of the WinLossRecord class, and then call its methods directly
// in a real-world scenario, it doesn't make much sense to create a class just to do simple calculations,
// but if you already have a class in use, you can easily create instances from the data set (or import existing instances directly)
val WL by column<WinLossRecord>()
dfBrady
.pivot(inward=false) { Meta.WinLoss }
.groupBy { Meta.Opp named "opponent" }
.count()
.add(WL) { WinLossRecord("win"<Int>(), "loss"<Int>()) }
.add {
"WinPct" from { WL().winPct } // note the (), we are working on the class instance in each row, not the DataColumn
"Undefeated" from { WL().undefeated }
}
.sortBy { "WinPct"<Double>().desc() }
... showing only top 20 of 35 rows
DataFrame: rowsCount = 35, columnsCount = 6
opponent | loss | win | WL | WinPct | Undefeated |
---|---|---|---|---|---|
MIN | 0 | 6 | WinLossRecord(wins=6, losses=0) | 1.000000 | true |
DAL | 0 | 7 | WinLossRecord(wins=7, losses=0) | 1.000000 | true |
TAM | 0 | 4 | WinLossRecord(wins=4, losses=0) | 1.000000 | true |
LAC | 0 | 2 | WinLossRecord(wins=2, losses=0) | 1.000000 | true |
LVR | 0 | 1 | WinLossRecord(wins=1, losses=0) | 1.000000 | true |
NWE | 0 | 1 | WinLossRecord(wins=1, losses=0) | 1.000000 | true |
BUF | 3 | 33 | WinLossRecord(wins=33, losses=3) | 0.916667 | false |
ATL | 1 | 10 | WinLossRecord(wins=10, losses=1) | 0.909091 | false |
CHI | 1 | 6 | WinLossRecord(wins=6, losses=1) | 0.857143 | false |
NYG | 1 | 6 | WinLossRecord(wins=6, losses=1) | 0.857143 | false |
PHI | 1 | 5 | WinLossRecord(wins=5, losses=1) | 0.833333 | false |
JAX | 1 | 5 | WinLossRecord(wins=5, losses=1) | 0.833333 | false |
NYJ | 7 | 30 | WinLossRecord(wins=30, losses=7) | 0.810811 | false |
IND | 3 | 12 | WinLossRecord(wins=12, losses=3) | 0.800000 | false |
OAK | 1 | 4 | WinLossRecord(wins=4, losses=1) | 0.800000 | false |
CLE | 2 | 7 | WinLossRecord(wins=7, losses=2) | 0.777778 | false |
HOU | 2 | 7 | WinLossRecord(wins=7, losses=2) | 0.777778 | false |
CIN | 2 | 7 | WinLossRecord(wins=7, losses=2) | 0.777778 | false |
DET | 2 | 5 | WinLossRecord(wins=5, losses=2) | 0.714286 | false |
SDG | 2 | 5 | WinLossRecord(wins=5, losses=2) | 0.714286 | false |
94. rolling totals¶
// what is Brady's highest passing-yards total over 3 consecutive games?
dfBrady
.select { Meta.Rk and Meta.Date and Passing.PassingYds }
.add("3gamePassYds") { (relative(-2..0)["PassingYds"] as DataColumn<Int>).sum() } // relative allows us to lookback (or forward) to other rows
.remove("PassingYds")
.sortBy { "3gamePassYds"<Int>().desc() }
.head(10)
// highest 3-game passing yards total was 1327 yards, ending 2011-09-25
// (517, 423, 387 to start the 2011 season, according to pro-football-reference)
DataFrame: rowsCount = 10, columnsCount = 3
Rk | Date | 3gamePassYds |
---|---|---|
179 | 2011-09-25 | 1327 |
222 | 2013-12-15 | 1153 |
178 | 2011-09-18 | 1139 |
336 | 2021-01-03 | 1137 |
221 | 2013-12-08 | 1133 |
276 | 2017-10-01 | 1132 |
337 | 2021-09-09 | 1126 |
243 | 2015-09-27 | 1112 |
341 | 2021-10-10 | 1112 |
244 | 2015-10-11 | 1099 |
95. flattening hierarchies¶
// where did Brady have a bigger home-field advantage, New England or Tampa?
dfBrady
.pivot() { Meta.WinLoss }
.groupBy { Meta.Tm and Meta.location }
.count()
.flatten()
.add("winPct") { WinLossRecord("win"<Int>(), "loss"<Int>()).winPct }
.pivot() { "location"<String>() }
.groupBy { "Tm"<String>() }
.values { "winPct"<Double>() }
.flatten()
.add("homeAdv") { "home"<Double>() - "away"<Double>() }
.sortBy { "homeAdv"<Double>().desc() }
// the win percentage at home in New England was roughly 17 percentage points higher than on the road (85% vs 69%)
// in Tampa, the gap was 8 percentage points (68% vs 60%)
DataFrame: rowsCount = 2, columnsCount = 4
Tm | away | home | homeAdv |
---|---|---|---|
NWE | 0.685315 | 0.852113 | 0.166798 |
TAM | 0.600000 | 0.680000 | 0.080000 |
96. longest consecutive streak¶
// what is the longest consecutive streak of games where Brady has at least 1 touchdown and zero interceptions (min. 20 pass attempts)?
dfBrady
.select { Meta.Rk and Meta.Date and Passing.PassingAtt and Passing.PassingTD and Passing.PassingInt }
.filter { "PassingAtt"<Int>() >= 20 } // filter out games without 20 attempts
.add("qualifies") { "PassingTD"<Int>() >= 1 && "PassingInt"<Int>() == 0 } // determine whether a game qualifies
// looking back 50 games is a "cheat", just a guess that no streak is longer than that
// so we get the last 50 games, convert it to a list of booleans, reverse it (so most recent games are first),
// "take" until we hit a false, then measure the list size
.add("streak") { (relative(-50..0)["qualifies"] as DataColumn<Boolean>).toList().reversed().takeWhile { it == true }.size }
.filter { "streak"<Int>() > 0 }
.sortBy { "streak"<Int>().desc() }
// note that we simply filter out games with fewer than 20 pass attempts -- they don't break the streak, we just ignore them
// longest streak = 10 games, ending 2010-12-26
... showing only top 20 of 160 rows
DataFrame: rowsCount = 160, columnsCount = 7
Rk | Date | PassingAtt | PassingTD | PassingInt | qualifies | streak |
---|---|---|---|---|---|---|
175 | 2010-12-26 | 27 | 3 | 0 | true | 10 |
174 | 2010-12-19 | 24 | 2 | 0 | true | 9 |
173 | 2010-12-12 | 40 | 2 | 0 | true | 8 |
172 | 2010-12-06 | 29 | 4 | 0 | true | 7 |
171 | 2010-11-25 | 27 | 4 | 0 | true | 6 |
170 | 2010-11-21 | 25 | 2 | 0 | true | 5 |
203 | 2012-11-22 | 27 | 3 | 0 | true | 5 |
359 | 2022-10-16 | 40 | 1 | 0 | true | 5 |
113 | 2007-09-09 | 28 | 3 | 0 | true | 4 |
120 | 2007-10-28 | 38 | 3 | 0 | true | 4 |
150 | 2009-10-18 | 34 | 6 | 0 | true | 4 |
169 | 2010-11-14 | 43 | 3 | 0 | true | 4 |
188 | 2011-12-04 | 38 | 2 | 0 | true | 4 |
202 | 2012-11-18 | 35 | 3 | 0 | true | 4 |
232 | 2014-10-26 | 35 | 5 | 0 | true | 4 |
244 | 2015-10-11 | 27 | 2 | 0 | true | 4 |
264 | 2016-10-30 | 33 | 4 | 0 | true | 4 |
282 | 2017-11-19 | 37 | 3 | 0 | true | 4 |
307 | 2019-09-22 | 42 | 2 | 0 | true | 4 |
328 | 2020-11-02 | 40 | 2 | 0 | true | 4 |
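The closing notes below mention that plain Kotlin collections can handle many calculations; as an example, here is a sketch (my own) of the same longest-streak question without the 50-game lookback guess: pull the per-game qualifying flags into a list and fold over it, tracking the current run and the best run seen so far. It assumes the Passing columns parsed as non-nullable Ints, as they appear to have above.

// sketch: longest qualifying streak via a plain fold over the qualifying flags
val qualifies: List<Boolean> = dfBrady
    .filter { Passing.PassingAtt >= 20 }                      // same rule: ignore games with fewer than 20 attempts
    .map { Passing.PassingTD >= 1 && Passing.PassingInt == 0 }
val longestStreak: Int = qualifies
    .fold(0 to 0) { (current, best), qualified ->
        val run = if (qualified) current + 1 else 0           // a non-qualifying game resets the run
        run to maxOf(best, run)
    }.second
longestStreak // should match the 10-game streak found above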
97. correlation¶
// is there a correlation between sacks and interceptions?
dfBrady
.corr { Passing.PassingInt }.with { Passing.PassingSk }
// doesn't look like it
DataFrame: rowsCount = 1, columnsCount = 2
column | PassingSk |
---|---|
PassingInt | 0.011552 |
// let's look by year
dfBrady
.distinct { Meta.Year }
.map { it["Year"] }
.associateWith { yr ->
dfBrady
.filter { Meta.Year == yr }
.corr { Passing.PassingInt }.with { Passing.PassingSk }
.values { "PassingSk"<Double>() }.first()
}.let {
dataFrameOf(
"year" to it.keys.toList(),
"corr" to it.values.toList()
)
}
// a few years have a significant positive correlation, but plenty of years have a negative one
... showing only top 20 of 23 rows
DataFrame: rowsCount = 23, columnsCount = 2
year | corr |
---|---|
2000 | NaN |
2001 | -0.025321 |
2002 | -0.150002 |
2003 | -0.039223 |
2004 | 0.400064 |
2005 | 0.280399 |
2006 | -0.271364 |
2007 | 0.419982 |
2008 | NaN |
2009 | -0.276026 |
2010 | 0.125140 |
2011 | -0.173422 |
2012 | 0.479397 |
2013 | 0.344656 |
2014 | -0.132886 |
2015 | -0.057716 |
2016 | 0.090660 |
2017 | -0.225762 |
2018 | -0.571649 |
2019 | 0.247841 |
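For what it's worth, the same per-year table can probably be produced with groupBy plus aggregate instead of the manual associateWith loop; treat this as an unverified sketch of my own, since I haven't confirmed that corr resolves inside the aggregate block.

// hedged sketch: aggregate each season's group directly
dfBrady
    .groupBy { Meta.Year named "year" }
    .aggregate {
        corr { Passing.PassingInt }.with { Passing.PassingSk }
            .values { "PassingSk"<Double>() }.first() into "corr"
    }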
98. plotting¶
// we thought there might be a correlation between sacks and interceptions (i.e. good defenses might generate both)
// but the coefficient was very close to zero
// we can plot the count of games and get a visual look
// JetBrains recently introduced the `kandy` plotting library (https://kotlin.github.io/kandy/overview.html)
// some other libraries include lets-plot, plotly and kravis
dfBrady.plot {
statCount2D(Passing.PassingInt, Passing.PassingSk) {
points {
x(Stat.x) {
axis {
name = "Interceptions"
breaks((0..4).toList(), format = "d")
}
}
y(Stat.y) {
axis {
name = "Sacks"
breaks((0..7).toList())
}
}
size(Stat.count) {
scale = continuous(2.0..20.0) //continuous(1..10)
}
color = Color.BLUE
}
}
layout {
size = 720 to 480
}
}
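To keep the chart as an image file rather than inline notebook output, kandy (via lets-plot) has an export helper; the exact name and signature may vary by version, so treat the save call below as an assumption, shown here with a stripped-down version of the plot above.

// assumption: kandy's export API writes the rendered plot to the working directory
val sackIntPlot = dfBrady.plot {
    statCount2D(Passing.PassingInt, Passing.PassingSk) {
        points {
            x(Stat.x)
            y(Stat.y)
            size(Stat.count)
        }
    }
}
sackIntPlot.save("sacks_vs_interceptions.png")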
99. facet plots¶
import java.time.Month
// kandy is still early in development; some advanced features such as faceted grids are incomplete and subject to API changes
dfBrady
.add("month") { if (Meta.Date.month == Month.JANUARY) Month.DECEMBER else Meta.Date.month }
.add("marginOfVictory") { Meta.OurScore - Meta.TheirScore }
.select { "month" and "marginOfVictory" and Passing.PassingAtt and Meta.location and Meta.WinLoss and Meta.Opp and Meta.Result and Meta.Year and Meta.Week}
.plot {
points {
x("PassingAtt") { axis.name = "Passing Attempts" }
y("marginOfVictory"<Int>())
symbol = Symbol.CIRCLE
size = 3.5
color("WinLoss") {
scale = categorical(listOf(Color.RED, Color.GREEN))
}
tooltips("Year", "Week", "Opp", "Result")
}
facetGrid("month"<Month>(), "location"<String>(),
scalesSharing = ScalesSharing.FIXED,
xOrder = OrderDirection.DESCENDING,
yOrder = OrderDirection.DESCENDING
)
layout {
theme(Theme.Grey)
size = 1080 to 540
}
}
// do more passing attempts hint at a loss? does it matter what month the game is?
100. writing to Excel and to HTML¶
// hierarchical data sets encode their sub-columns in JSON objects,
// which isn't very useful since Excel doesn't natively read or convert JSON
dfBrady.writeExcel("Brady.xlsx")
// however flat data works as expected
dfBrady.Passing.writeExcel("BradyPassing.xlsx")
// just like with CSV and databases, you can choose a separate library instead of DataFrame's native Excel support
// Apache POI is very good, and a number of other libraries use it under the hood
dfBrady.toStandaloneHTML(DisplayConfiguration(rowsLimit = null)).openInBrowser()
// can also write to a file -- documentation is limited but the API is available at:
// https://github.com/Kotlin/dataframe/blob/master/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/format.kt
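As a sketch of the write-to-file path (my assumption about the API, not something confirmed by the original): the object returned by toStandaloneHTML should expose a writeHTML helper, so the HTML can go to a file instead of the browser.

// assumption: DataFrameHtmlData.writeHTML(File) exists in this version
import java.io.File
dfBrady.toStandaloneHTML(DisplayConfiguration(rowsLimit = null)).writeHTML(File("Brady.html"))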
Thank you for following along! As you may have seen, Kotlin Dataframe often requires a little trial-and-error to master its API, especially with more complex operations like grouping, pivoting, hierarchies, and aggregations.
Dataframe is a terrific, powerful tool, and a necessary library if Kotlin is to gain traction among data scientists. It's one additional tool in the toolbox, but certainly not the only one. Dataframe does not make SQL obsolete by any means: databases have very intelligent query engines and can often perform operations much faster (and send far less data over the wire) than pulling all the data down and performing every operation in dataframe. Additionally, standard Kotlin collections are powerful in their own right, and many calculations can be done easily with a fold or reduce.
Hope you found this helpful and maybe even learned something! Please feel free to send a comment, submit a PR, or ask a question.
andy [at] cfnine [dot] com
...open to consulting or employment opportunities as well, thanks!