From 0d2be018c29713555caa016f8e80c013b39c7b70 Mon Sep 17 00:00:00 2001 From: Julien Lengrand-Lambert Date: Wed, 11 Mar 2020 09:06:06 +0100 Subject: [PATCH] Better naming and add episodes table --- README.md | 2 ++ src/main/kotlin/{loader => }/LoadImdb.kt | 14 ++++---- src/main/kotlin/dsl/Crew.kt | 14 ++++++++ src/main/kotlin/dsl/Episodes.kt | 14 ++++++++ src/main/kotlin/dsl/Names.kt | 2 +- .../dsl/{TitleRatings.kt => Ratings.kt} | 2 +- src/main/kotlin/dsl/Titles.kt | 4 +-- src/main/kotlin/loader/CrewsLoader.kt | 35 ++++++++++++++++++ src/main/kotlin/loader/EpisodesLoader.kt | 36 +++++++++++++++++++ .../{NameBasicsLoader.kt => NamesLoader.kt} | 2 +- ...TitleRatingsLoader.kt => RatingsLoader.kt} | 12 +++---- .../{TitleBasicsLoader.kt => TitlesLoader.kt} | 2 +- src/main/kotlin/test.kt | 2 +- 13 files changed, 120 insertions(+), 21 deletions(-) rename src/main/kotlin/{loader => }/LoadImdb.kt (86%) create mode 100644 src/main/kotlin/dsl/Crew.kt create mode 100644 src/main/kotlin/dsl/Episodes.kt rename src/main/kotlin/dsl/{TitleRatings.kt => Ratings.kt} (92%) create mode 100644 src/main/kotlin/loader/CrewsLoader.kt create mode 100644 src/main/kotlin/loader/EpisodesLoader.kt rename src/main/kotlin/loader/{NameBasicsLoader.kt => NamesLoader.kt} (97%) rename src/main/kotlin/loader/{TitleRatingsLoader.kt => RatingsLoader.kt} (67%) rename src/main/kotlin/loader/{TitleBasicsLoader.kt => TitlesLoader.kt} (97%) diff --git a/README.md b/README.md index 211278f..09d2d43 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,8 @@ See [LICENSE](/LICENSE) * `Database.connect("jdbc:mysql://localhost:3308/imdb?useSSL=false", driver = "com.mysql.jdbc.Driver", user = "root", password = "aRootPassword")` * Use `rewriteBatchedStatements=true` when inserting large volumes of data to have your driver rewrite your query * `.map` keeps stack of memory while `for` loop doesn't? I get a OME when running with map +* reason to use partitions +* dsl loading copy paste prone ## Author diff --git a/src/main/kotlin/loader/LoadImdb.kt b/src/main/kotlin/LoadImdb.kt similarity index 86% rename from src/main/kotlin/loader/LoadImdb.kt rename to src/main/kotlin/LoadImdb.kt index d53ffe1..573a122 100644 --- a/src/main/kotlin/loader/LoadImdb.kt +++ b/src/main/kotlin/LoadImdb.kt @@ -1,13 +1,8 @@ -package loader - -import dsl.TitleRatings -import dsl.Titles +import loader.CrewsLoader +import loader.EpisodesLoader import org.jetbrains.exposed.sql.Database import org.jetbrains.exposed.sql.SchemaUtils -import org.jetbrains.exposed.sql.statements.BatchInsertStatement import org.jetbrains.exposed.sql.transactions.transaction -import tsv.Reader -import kotlin.system.measureTimeMillis fun main() { @@ -20,6 +15,7 @@ fun main() { password = "" ) + transaction(db) { SchemaUtils.dropDatabase("imdb") } transaction(db) { SchemaUtils.createDatabase("imdb") } db = Database.connect( @@ -32,5 +28,7 @@ fun main() { // TitleRatingsLoader.load(db) // TitleBasicsLoader.load(db) - NameBasicsLoader.load(db) +// NameBasicsLoader.load(db) +// CrewsLoader.load(db) + EpisodesLoader.load(db) } \ No newline at end of file diff --git a/src/main/kotlin/dsl/Crew.kt b/src/main/kotlin/dsl/Crew.kt new file mode 100644 index 0000000..e95dffb --- /dev/null +++ b/src/main/kotlin/dsl/Crew.kt @@ -0,0 +1,14 @@ +package dsl + +import org.jetbrains.exposed.sql.Column +import org.jetbrains.exposed.sql.Table +import org.jetbrains.exposed.sql.statements.api.ExposedBlob + +object Crew : Table(){ + val tconst : Column = varchar("tconst", 10).uniqueIndex() + val directors : Column = text("directors") // TODO: better + val writers : Column = text("writers") // TODO: better + + override val primaryKey = PrimaryKey(tconst, name = "tconst") + +} \ No newline at end of file diff --git a/src/main/kotlin/dsl/Episodes.kt b/src/main/kotlin/dsl/Episodes.kt new file mode 100644 index 0000000..6a81b4d --- /dev/null +++ b/src/main/kotlin/dsl/Episodes.kt @@ -0,0 +1,14 @@ +package dsl + +import org.jetbrains.exposed.sql.Column +import org.jetbrains.exposed.sql.Table + +object Episodes : Table(){ + val tconst : Column = Episodes.varchar("tconst", 10).uniqueIndex() + val parentTconst : Column = Episodes.varchar("parentTconst", 10) + val seasonNumber : Column = Episodes.integer("seasonNumber").nullable() + val episodeNumber : Column = Episodes.integer("episodeNumber").nullable() + + override val primaryKey = PrimaryKey(tconst, name = "tconst") + +} \ No newline at end of file diff --git a/src/main/kotlin/dsl/Names.kt b/src/main/kotlin/dsl/Names.kt index a88be74..3c02eab 100644 --- a/src/main/kotlin/dsl/Names.kt +++ b/src/main/kotlin/dsl/Names.kt @@ -9,7 +9,7 @@ object Names : Table(){ val birthYear : Column = integer("birthYear").nullable() val deathYear : Column = integer("deathYear").nullable() val primaryProfession : Column = varchar("primaryProfession", 500) - val knownForTitles : Column = varchar("knownForTitles", 200) // Improve! + val knownForTitles : Column = varchar("knownForTitles", 200) // TODO: Improve! override val primaryKey = PrimaryKey(nconst, name = "nconst") } \ No newline at end of file diff --git a/src/main/kotlin/dsl/TitleRatings.kt b/src/main/kotlin/dsl/Ratings.kt similarity index 92% rename from src/main/kotlin/dsl/TitleRatings.kt rename to src/main/kotlin/dsl/Ratings.kt index 4bbe8c9..02996f6 100644 --- a/src/main/kotlin/dsl/TitleRatings.kt +++ b/src/main/kotlin/dsl/Ratings.kt @@ -3,7 +3,7 @@ package dsl import org.jetbrains.exposed.sql.Column import org.jetbrains.exposed.sql.Table -object TitleRatings : Table(){ +object Ratings : Table(){ val tconst : Column = varchar("tconst", 10).uniqueIndex() val averageRating : Column = float("averageRating").nullable() val numVotes : Column = integer("numVotes").nullable() diff --git a/src/main/kotlin/dsl/Titles.kt b/src/main/kotlin/dsl/Titles.kt index b8ad87b..83c651e 100644 --- a/src/main/kotlin/dsl/Titles.kt +++ b/src/main/kotlin/dsl/Titles.kt @@ -5,14 +5,14 @@ import org.jetbrains.exposed.sql.Table object Titles : Table(){ val tconst : Column = varchar("tconst", 10).uniqueIndex() - val titleType : Column = varchar("titleType", 50) // Own Table? + val titleType : Column = varchar("titleType", 50) // TODO: Own Table? val primaryTitle : Column = varchar("primaryTitle", 500) val originalTitle : Column = varchar("originalTitle", 500) val isAdult : Column = bool("isAdult") val startYear : Column = integer("startYear").nullable() val endYear : Column = integer("endYear").nullable() val runtimeMinutes : Column = long("runtimeMinutes").nullable() - val genres : Column = varchar("genres", 50) // Own Table? + val genres : Column = varchar("genres", 50) // TODO: Own Table? override val primaryKey = PrimaryKey(tconst, name = "tconst") } \ No newline at end of file diff --git a/src/main/kotlin/loader/CrewsLoader.kt b/src/main/kotlin/loader/CrewsLoader.kt new file mode 100644 index 0000000..9960ab0 --- /dev/null +++ b/src/main/kotlin/loader/CrewsLoader.kt @@ -0,0 +1,35 @@ +package loader + +import dsl.Crew +import org.jetbrains.exposed.sql.Database +import org.jetbrains.exposed.sql.statements.BatchInsertStatement +import kotlin.system.measureTimeMillis + + +object CrewsLoader { + + fun load(db: Database){ + println("Loading Crews") + + val time = measureTimeMillis() { // duplication + + TableLoader.process(db, + Crew, + "./datasets/title.crew.tsv", + 5000, + insert() + ) + } + println("Time was : ${time / 1000 / 60 } minutes ${time / 1000 % 60 } seconds") + } +} + +private fun insert(): BatchInsertStatement.(String) -> Unit { + return { + val items = it.split("\t") + + this[Crew.tconst] = items[0] + this[Crew.directors] = items[1] + this[Crew.writers] = items[2] + } +} \ No newline at end of file diff --git a/src/main/kotlin/loader/EpisodesLoader.kt b/src/main/kotlin/loader/EpisodesLoader.kt new file mode 100644 index 0000000..51cc2db --- /dev/null +++ b/src/main/kotlin/loader/EpisodesLoader.kt @@ -0,0 +1,36 @@ +package loader + +import dsl.Episodes +import org.jetbrains.exposed.sql.Database +import org.jetbrains.exposed.sql.statements.BatchInsertStatement +import tsv.Reader +import kotlin.system.measureTimeMillis + +object EpisodesLoader{ + + fun load(db: Database){ + println("Loading Episodes") + + val time = measureTimeMillis() { // duplication + + TableLoader.process(db, + Episodes, + "./datasets/title.episode.tsv", + 1000, + insert() + ) + } + println("Time was : ${time / 1000 / 60 } minutes ${time / 1000 % 60 } seconds") + } +} + +private fun insert(): BatchInsertStatement.(String) -> Unit { + return { + val items = it.split("\t") + + this[Episodes.tconst] = items[0] + this[Episodes.parentTconst] = items[1] + this[Episodes.seasonNumber] = if (items[2] != Reader.NO_DATA) items[2].toInt() else null + this[Episodes.episodeNumber] = if (items[3] != Reader.NO_DATA) items[3].toInt() else null + } +} \ No newline at end of file diff --git a/src/main/kotlin/loader/NameBasicsLoader.kt b/src/main/kotlin/loader/NamesLoader.kt similarity index 97% rename from src/main/kotlin/loader/NameBasicsLoader.kt rename to src/main/kotlin/loader/NamesLoader.kt index 89f8ede..00b709c 100644 --- a/src/main/kotlin/loader/NameBasicsLoader.kt +++ b/src/main/kotlin/loader/NamesLoader.kt @@ -10,7 +10,7 @@ import org.jetbrains.exposed.sql.statements.BatchInsertStatement import tsv.Reader import kotlin.system.measureTimeMillis -object NameBasicsLoader{ +object NameLoader{ fun load(db: Database){ println("Loading Names Basics") diff --git a/src/main/kotlin/loader/TitleRatingsLoader.kt b/src/main/kotlin/loader/RatingsLoader.kt similarity index 67% rename from src/main/kotlin/loader/TitleRatingsLoader.kt rename to src/main/kotlin/loader/RatingsLoader.kt index ca07767..bf6b8ef 100644 --- a/src/main/kotlin/loader/TitleRatingsLoader.kt +++ b/src/main/kotlin/loader/RatingsLoader.kt @@ -1,13 +1,13 @@ package loader -import dsl.TitleRatings +import dsl.Ratings import org.jetbrains.exposed.sql.Database import org.jetbrains.exposed.sql.statements.BatchInsertStatement import tsv.Reader import kotlin.system.measureTimeMillis -object TitleRatingsLoader { +object RatingsLoader { fun load(db: Database){ println("Loading Title Ratings") @@ -15,7 +15,7 @@ object TitleRatingsLoader { val time = measureTimeMillis() { // duplication TableLoader.process(db, - TitleRatings, + Ratings, "./datasets/title.ratings.tsv", 1, insert() @@ -30,8 +30,8 @@ private fun insert(): BatchInsertStatement.(String) -> Unit { return { val items = it.split("\t") - this[TitleRatings.tconst] = items[0] - this[TitleRatings.averageRating] = if (items[1] != Reader.NO_DATA) items[1].toFloat() else null - this[TitleRatings.numVotes] = if (items[2] != Reader.NO_DATA) items[2].toInt() else null + this[Ratings.tconst] = items[0] + this[Ratings.averageRating] = if (items[1] != Reader.NO_DATA) items[1].toFloat() else null + this[Ratings.numVotes] = if (items[2] != Reader.NO_DATA) items[2].toInt() else null } } \ No newline at end of file diff --git a/src/main/kotlin/loader/TitleBasicsLoader.kt b/src/main/kotlin/loader/TitlesLoader.kt similarity index 97% rename from src/main/kotlin/loader/TitleBasicsLoader.kt rename to src/main/kotlin/loader/TitlesLoader.kt index d0484dc..8a7f689 100644 --- a/src/main/kotlin/loader/TitleBasicsLoader.kt +++ b/src/main/kotlin/loader/TitlesLoader.kt @@ -6,7 +6,7 @@ import org.jetbrains.exposed.sql.statements.BatchInsertStatement import tsv.Reader import kotlin.system.measureTimeMillis -object TitleBasicsLoader{ +object TitleLoader{ fun load(db: Database){ println("Loading Title Basics") diff --git a/src/main/kotlin/test.kt b/src/main/kotlin/test.kt index 12f5c4f..b6f8ec8 100644 --- a/src/main/kotlin/test.kt +++ b/src/main/kotlin/test.kt @@ -18,7 +18,7 @@ object Cities : Table() { override val primaryKey = PrimaryKey(id, name = "PK_Cities_ID") } -fun main() { +fun main2() { Database.connect("jdbc:h2:mem:test", driver = "org.h2.Driver", user = "root", password = "") transaction {