mirror of
https://github.com/jlengrand/OpenGraphKt.git
synced 2026-03-10 08:31:23 +00:00
Fix types (#22)
* Improves types * Adds missing properties to music album * Changes gender from String to Enum * Changes URL to an actual URL * Fix typo * Adds scalable live testing on real data * Uses OffsetDateTime for articles, videos and books
This commit is contained in:
committed by
GitHub
parent
79b169fa81
commit
5372fab21c
1
.idea/gradle.xml
generated
1
.idea/gradle.xml
generated
@@ -12,6 +12,7 @@
|
|||||||
<option value="$PROJECT_DIR$/demo" />
|
<option value="$PROJECT_DIR$/demo" />
|
||||||
<option value="$PROJECT_DIR$/demo-remote" />
|
<option value="$PROJECT_DIR$/demo-remote" />
|
||||||
<option value="$PROJECT_DIR$/opengraphkt" />
|
<option value="$PROJECT_DIR$/opengraphkt" />
|
||||||
|
<option value="$PROJECT_DIR$/scrape-test" />
|
||||||
</set>
|
</set>
|
||||||
</option>
|
</option>
|
||||||
</GradleProjectSettings>
|
</GradleProjectSettings>
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
package fr.lengrand.opengraphkt
|
package fr.lengrand.opengraphkt
|
||||||
|
|
||||||
|
import java.net.URL
|
||||||
|
import java.time.OffsetDateTime
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enum representing the different types of Open Graph objects.
|
* Enum representing the different types of Open Graph objects.
|
||||||
*/
|
*/
|
||||||
@@ -46,6 +49,21 @@ enum class Type {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Gender of a profile, as carried by the `profile:gender` Open Graph tag.
 *
 * [toString] renders the lowercase constant name (`male` / `female`), which is
 * also the textual form accepted by [fromString].
 */
enum class Gender {
    MALE,
    FEMALE;

    /** Returns the lowercase constant name, e.g. `male`. */
    override fun toString(): String = name.lowercase()

    companion object {
        /**
         * Converts the textual form of a gender into the matching [Gender].
         *
         * Matching ignores case as well as leading and trailing whitespace, so
         * `"male"`, `"MALE"` and `" Male "` all resolve to [MALE].
         *
         * @param gender The raw text to convert
         * @return The matching [Gender] constant
         * @throws IllegalArgumentException if [gender] does not name a known constant
         */
        fun fromString(gender: String): Gender {
            // Scraped attribute values frequently carry stray whitespace; normalise before matching.
            val normalized = gender.trim().uppercase()
            return values().firstOrNull { it.name == normalized }
                ?: throw IllegalArgumentException(
                    "Unsupported gender '$gender'; expected one of: ${values().joinToString()}"
                )
        }
    }
}
|
||||||
|
|
||||||
data class Tag(
|
data class Tag(
|
||||||
val property: String,
|
val property: String,
|
||||||
val content: String,
|
val content: String,
|
||||||
@@ -60,15 +78,15 @@ data class Data(
|
|||||||
// Basic metadata
|
// Basic metadata
|
||||||
val title: String?,
|
val title: String?,
|
||||||
val type: String?,
|
val type: String?,
|
||||||
val url: String?,
|
val url: URL?,
|
||||||
val description: String?,
|
val description: String?,
|
||||||
|
|
||||||
|
// Other metadata
|
||||||
val siteName: String?,
|
val siteName: String?,
|
||||||
val determiner: String?,
|
val determiner: String?,
|
||||||
val locale: String?,
|
val locale: String?,
|
||||||
val localeAlternate: List<String>,
|
val localeAlternate: List<String>,
|
||||||
|
|
||||||
// Structured properties
|
|
||||||
val images: List<Image>,
|
val images: List<Image>,
|
||||||
val videos: List<Video>,
|
val videos: List<Video>,
|
||||||
val audios: List<Audio>,
|
val audios: List<Audio>,
|
||||||
@@ -77,14 +95,10 @@ data class Data(
|
|||||||
val article: Article?,
|
val article: Article?,
|
||||||
val profile: Profile?,
|
val profile: Profile?,
|
||||||
val book: Book?,
|
val book: Book?,
|
||||||
|
|
||||||
// Music types
|
|
||||||
val musicSong: MusicSong?,
|
val musicSong: MusicSong?,
|
||||||
val musicAlbum: MusicAlbum?,
|
val musicAlbum: MusicAlbum?,
|
||||||
val musicPlaylist: MusicPlaylist?,
|
val musicPlaylist: MusicPlaylist?,
|
||||||
val musicRadioStation: MusicRadioStation?,
|
val musicRadioStation: MusicRadioStation?,
|
||||||
|
|
||||||
// Video types
|
|
||||||
val videoMovie: VideoMovie?,
|
val videoMovie: VideoMovie?,
|
||||||
val videoEpisode: VideoEpisode?
|
val videoEpisode: VideoEpisode?
|
||||||
) {
|
) {
|
||||||
@@ -137,16 +151,19 @@ data class Audio(
|
|||||||
val type: String?
|
val type: String?
|
||||||
)
|
)
|
||||||
|
|
||||||
/**
|
|
||||||
* * video.tv_show - same as video.movie
|
|
||||||
* * video.other - same as video.movie
|
|
||||||
*/
|
|
||||||
data class Article(
|
data class Article(
|
||||||
val publishedTime: String?,
|
val publishedTime: OffsetDateTime?,
|
||||||
val modifiedTime: String?,
|
val modifiedTime: OffsetDateTime?,
|
||||||
val expirationTime: String?,
|
val expirationTime: OffsetDateTime?,
|
||||||
val section: String?,
|
|
||||||
val authors: List<String>,
|
val authors: List<String>,
|
||||||
|
val section: String?,
|
||||||
|
val tags: List<String>
|
||||||
|
)
|
||||||
|
|
||||||
|
data class Book(
|
||||||
|
val authors: List<String>,
|
||||||
|
val isbn: String?,
|
||||||
|
val releaseDate: OffsetDateTime?,
|
||||||
val tags: List<String>
|
val tags: List<String>
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -154,14 +171,7 @@ data class Profile(
|
|||||||
val firstName: String?,
|
val firstName: String?,
|
||||||
val lastName: String?,
|
val lastName: String?,
|
||||||
val username: String?,
|
val username: String?,
|
||||||
val gender: String?
|
val gender: Gender?
|
||||||
)
|
|
||||||
|
|
||||||
data class Book(
|
|
||||||
val authors: List<String>,
|
|
||||||
val isbn: String?,
|
|
||||||
val releaseDate: String?,
|
|
||||||
val tags: List<String>
|
|
||||||
)
|
)
|
||||||
|
|
||||||
data class MusicSong(
|
data class MusicSong(
|
||||||
@@ -174,12 +184,16 @@ data class MusicSong(
|
|||||||
|
|
||||||
data class MusicAlbum(
|
data class MusicAlbum(
|
||||||
val songs: List<String>,
|
val songs: List<String>,
|
||||||
|
val songDisc: Int?,
|
||||||
|
val songTrack: Int?,
|
||||||
val musician: List<String>,
|
val musician: List<String>,
|
||||||
val releaseDate: String?
|
val releaseDate: OffsetDateTime?
|
||||||
)
|
)
|
||||||
|
|
||||||
data class MusicPlaylist(
|
data class MusicPlaylist(
|
||||||
val songs: List<String>,
|
val songs: List<String>,
|
||||||
|
val songDisc: Int?,
|
||||||
|
val songTrack: Int?,
|
||||||
val creator: String?
|
val creator: String?
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -192,7 +206,7 @@ data class VideoMovie(
|
|||||||
val director: List<String>,
|
val director: List<String>,
|
||||||
val writer: List<String>,
|
val writer: List<String>,
|
||||||
val duration: Int?,
|
val duration: Int?,
|
||||||
val releaseDate: String?,
|
val releaseDate: OffsetDateTime?,
|
||||||
val tags: List<String>
|
val tags: List<String>
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -201,7 +215,7 @@ data class VideoEpisode(
|
|||||||
val director: List<String>,
|
val director: List<String>,
|
||||||
val writer: List<String>,
|
val writer: List<String>,
|
||||||
val duration: Int?,
|
val duration: Int?,
|
||||||
val releaseDate: String?,
|
val releaseDate: OffsetDateTime?,
|
||||||
val tags: List<String>,
|
val tags: List<String>,
|
||||||
val series: String?
|
val series: String?
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -4,7 +4,10 @@ import org.jsoup.Jsoup
|
|||||||
import org.jsoup.nodes.Document
|
import org.jsoup.nodes.Document
|
||||||
import org.jsoup.select.Elements
|
import org.jsoup.select.Elements
|
||||||
import java.io.File
|
import java.io.File
|
||||||
|
import java.net.URI
|
||||||
import java.net.URL
|
import java.net.URL
|
||||||
|
import java.time.OffsetDateTime
|
||||||
|
import java.time.format.DateTimeParseException
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A comprehensive parser for Open Graph protocol tags.
|
* A comprehensive parser for Open Graph protocol tags.
|
||||||
@@ -17,11 +20,35 @@ import java.net.URL
|
|||||||
*/
|
*/
|
||||||
class Parser {
|
class Parser {
|
||||||
|
|
||||||
|
/**
 * Parses an ISO 8601 string into an [OffsetDateTime].
 *
 * Three shapes are accepted, tried in this order:
 * 1. a date-time with an offset, e.g. `2023-01-02T12:00:00Z` or `2023-01-02T12:00:00+02:00`
 * 2. a date-time without an offset, e.g. `2023-01-02T12:00:00` (interpreted as UTC)
 * 3. a date only, e.g. `2023-01-02` (interpreted as midnight UTC)
 *
 * Leading and trailing whitespace is ignored.
 *
 * @param dateTimeString The string to parse
 * @return The parsed OffsetDateTime, or null if the string is null, blank or cannot be parsed
 */
private fun parseDateTime(dateTimeString: String?): OffsetDateTime? {
    val text = dateTimeString?.trim()?.takeIf { it.isNotEmpty() } ?: return null

    // Fully qualified names keep this function independent of the file's import list.
    val utc = java.time.ZoneOffset.UTC
    val candidates: List<(String) -> OffsetDateTime> = listOf(
        { value -> OffsetDateTime.parse(value) },
        // No offset supplied: assume UTC, the same assumption already made for date-only values.
        { value -> java.time.LocalDateTime.parse(value).atOffset(utc) },
        { value -> java.time.LocalDate.parse(value).atStartOfDay().atOffset(utc) },
    )

    // First parser that accepts the text wins; an unparseable value yields null rather than an exception.
    return candidates.firstNotNullOfOrNull { parser ->
        try {
            parser(text)
        } catch (_: DateTimeParseException) {
            null
        }
    }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts all Open Graph tags from a JSoup Document and returns a structured Data object.
|
* Extracts all Open Graph tags from a JSoup Document and returns a structured Data object.
|
||||||
*
|
*
|
||||||
* @param document The JSoup Document to parse
|
* @param document The JSoup Document to parse
|
||||||
* @return An Data object containing all extracted Open Graph data
|
* @return A Data object containing all extracted Open Graph data
|
||||||
*/
|
*/
|
||||||
fun parse(document: Document): Data {
|
fun parse(document: Document): Data {
|
||||||
val tags = document.select("meta[property^=og:]")
|
val tags = document.select("meta[property^=og:]")
|
||||||
@@ -100,7 +127,8 @@ class Parser {
|
|||||||
// Build basic properties
|
// Build basic properties
|
||||||
val title = getFirstTagContent(tags, "title")
|
val title = getFirstTagContent(tags, "title")
|
||||||
val type = getFirstTagContent(tags, "type")
|
val type = getFirstTagContent(tags, "type")
|
||||||
val url = getFirstTagContent(tags, "url")
|
val urlString = getFirstTagContent(tags, "url")
|
||||||
|
val url = urlString?.let{URI(urlString).toURL()}
|
||||||
val description = getFirstTagContent(tags, "description")
|
val description = getFirstTagContent(tags, "description")
|
||||||
val siteName = getFirstTagContent(tags, "site_name")
|
val siteName = getFirstTagContent(tags, "site_name")
|
||||||
val determiner = getFirstTagContent(tags, "determiner")
|
val determiner = getFirstTagContent(tags, "determiner")
|
||||||
@@ -329,9 +357,13 @@ class Parser {
|
|||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
|
|
||||||
val publishedTime = articleTags.firstOrNull { it.property == "article:published_time" }?.content
|
val publishedTimeString = articleTags.firstOrNull { it.property == "article:published_time" }?.content
|
||||||
val modifiedTime = articleTags.firstOrNull { it.property == "article:modified_time" }?.content
|
val modifiedTimeString = articleTags.firstOrNull { it.property == "article:modified_time" }?.content
|
||||||
val expirationTime = articleTags.firstOrNull { it.property == "article:expiration_time" }?.content
|
val expirationTimeString = articleTags.firstOrNull { it.property == "article:expiration_time" }?.content
|
||||||
|
|
||||||
|
val publishedTime = parseDateTime(publishedTimeString)
|
||||||
|
val modifiedTime = parseDateTime(modifiedTimeString)
|
||||||
|
val expirationTime = parseDateTime(expirationTimeString)
|
||||||
val section = articleTags.firstOrNull { it.property == "article:section" }?.content
|
val section = articleTags.firstOrNull { it.property == "article:section" }?.content
|
||||||
val authors = articleTags.filter { it.property == "article:author" }.map { it.content }
|
val authors = articleTags.filter { it.property == "article:author" }.map { it.content }
|
||||||
val tags = articleTags.filter { it.property == "article:tag" }.map { it.content }
|
val tags = articleTags.filter { it.property == "article:tag" }.map { it.content }
|
||||||
@@ -350,7 +382,7 @@ class Parser {
|
|||||||
* Builds an Profile object from profile-related tags.
|
* Builds an Profile object from profile-related tags.
|
||||||
*
|
*
|
||||||
* @param groupedTags The map of grouped Tag objects
|
* @param groupedTags The map of grouped Tag objects
|
||||||
* @return An Profile object, or null if no profile tags are found
|
* @return A Profile object, or null if no profile tags are found
|
||||||
*/
|
*/
|
||||||
private fun buildProfile(groupedTags: Map<String, List<Tag>>): Profile? {
|
private fun buildProfile(groupedTags: Map<String, List<Tag>>): Profile? {
|
||||||
val profileTags = groupedTags.getOrDefault("profile", emptyList())
|
val profileTags = groupedTags.getOrDefault("profile", emptyList())
|
||||||
@@ -362,7 +394,8 @@ class Parser {
|
|||||||
val firstName = profileTags.firstOrNull { it.property == "profile:first_name" }?.content
|
val firstName = profileTags.firstOrNull { it.property == "profile:first_name" }?.content
|
||||||
val lastName = profileTags.firstOrNull { it.property == "profile:last_name" }?.content
|
val lastName = profileTags.firstOrNull { it.property == "profile:last_name" }?.content
|
||||||
val username = profileTags.firstOrNull { it.property == "profile:username" }?.content
|
val username = profileTags.firstOrNull { it.property == "profile:username" }?.content
|
||||||
val gender = profileTags.firstOrNull { it.property == "profile:gender" }?.content
|
val genderString = profileTags.firstOrNull { it.property == "profile:gender" }?.content
|
||||||
|
val gender = genderString?.let(Gender::fromString)
|
||||||
|
|
||||||
return Profile(
|
return Profile(
|
||||||
firstName = firstName,
|
firstName = firstName,
|
||||||
@@ -387,7 +420,8 @@ class Parser {
|
|||||||
|
|
||||||
val authors = bookTags.filter { it.property == "book:author" }.map { it.content }
|
val authors = bookTags.filter { it.property == "book:author" }.map { it.content }
|
||||||
val isbn = bookTags.firstOrNull { it.property == "book:isbn" }?.content
|
val isbn = bookTags.firstOrNull { it.property == "book:isbn" }?.content
|
||||||
val releaseDate = bookTags.firstOrNull { it.property == "book:release_date" }?.content
|
val releaseDateString = bookTags.firstOrNull { it.property == "book:release_date" }?.content
|
||||||
|
val releaseDate = parseDateTime(releaseDateString)
|
||||||
val tags = bookTags.filter { it.property == "book:tag" }.map { it.content }
|
val tags = bookTags.filter { it.property == "book:tag" }.map { it.content }
|
||||||
|
|
||||||
return Book(
|
return Book(
|
||||||
@@ -440,18 +474,23 @@ class Parser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
val songs = musicTags.filter { it.property == "music:song" }.map { it.content }
|
val songs = musicTags.filter { it.property == "music:song" }.map { it.content }
|
||||||
|
val songDisc = musicTags.firstOrNull { it.property == "music:song:disc" }?.content?.toIntOrNull()
|
||||||
|
val songTrack = musicTags.firstOrNull { it.property == "music:song:track" }?.content?.toIntOrNull()
|
||||||
val musicians = musicTags.filter { it.property == "music:musician" }.map { it.content }
|
val musicians = musicTags.filter { it.property == "music:musician" }.map { it.content }
|
||||||
val releaseDate = musicTags.firstOrNull { it.property == "music:release_date" }?.content
|
val releaseDateString = musicTags.firstOrNull { it.property == "music:release_date" }?.content
|
||||||
|
val releaseDate = parseDateTime(releaseDateString)
|
||||||
|
|
||||||
return MusicAlbum(
|
return MusicAlbum(
|
||||||
songs = songs,
|
songs = songs,
|
||||||
|
songDisc = songDisc,
|
||||||
|
songTrack = songTrack,
|
||||||
musician = musicians,
|
musician = musicians,
|
||||||
releaseDate = releaseDate
|
releaseDate = releaseDate
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an MusicPlaylist object from music.playlist-related tags.
|
* Builds a MusicPlaylist object from music.playlist-related tags.
|
||||||
*
|
*
|
||||||
* @param groupedTags The map of grouped Tag objects
|
* @param groupedTags The map of grouped Tag objects
|
||||||
* @return An MusicPlaylist object, or null if no music.playlist tags are found
|
* @return An MusicPlaylist object, or null if no music.playlist tags are found
|
||||||
@@ -464,16 +503,20 @@ class Parser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
val songs = musicTags.filter { it.property == "music:song" }.map { it.content }
|
val songs = musicTags.filter { it.property == "music:song" }.map { it.content }
|
||||||
|
val songDisc = musicTags.firstOrNull { it.property == "music:song:disc" }?.content?.toIntOrNull()
|
||||||
|
val songTrack = musicTags.firstOrNull { it.property == "music:song:track" }?.content?.toIntOrNull()
|
||||||
val creator = musicTags.firstOrNull { it.property == "music:creator" }?.content
|
val creator = musicTags.firstOrNull { it.property == "music:creator" }?.content
|
||||||
|
|
||||||
return MusicPlaylist(
|
return MusicPlaylist(
|
||||||
songs = songs,
|
songs = songs,
|
||||||
|
songDisc = songDisc,
|
||||||
|
songTrack = songTrack,
|
||||||
creator = creator
|
creator = creator
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an MusicRadioStation object from music.radio_station-related tags.
|
* Builds a MusicRadioStation object from music.radio_station-related tags.
|
||||||
*
|
*
|
||||||
* @param groupedTags The map of grouped Tag objects
|
* @param groupedTags The map of grouped Tag objects
|
||||||
* @return An MusicRadioStation object, or null if no music.radio_station tags are found
|
* @return An MusicRadioStation object, or null if no music.radio_station tags are found
|
||||||
@@ -509,7 +552,8 @@ class Parser {
|
|||||||
val directors = videoTags.filter { it.property == "video:director" }.map { it.content }
|
val directors = videoTags.filter { it.property == "video:director" }.map { it.content }
|
||||||
val writers = videoTags.filter { it.property == "video:writer" }.map { it.content }
|
val writers = videoTags.filter { it.property == "video:writer" }.map { it.content }
|
||||||
val duration = videoTags.firstOrNull { it.property == "video:duration" }?.content?.toIntOrNull()
|
val duration = videoTags.firstOrNull { it.property == "video:duration" }?.content?.toIntOrNull()
|
||||||
val releaseDate = videoTags.firstOrNull { it.property == "video:release_date" }?.content
|
val releaseDateString = videoTags.firstOrNull { it.property == "video:release_date" }?.content
|
||||||
|
val releaseDate = parseDateTime(releaseDateString)
|
||||||
val tags = videoTags.filter { it.property == "video:tag" }.map { it.content }
|
val tags = videoTags.filter { it.property == "video:tag" }.map { it.content }
|
||||||
|
|
||||||
return VideoMovie(
|
return VideoMovie(
|
||||||
@@ -539,7 +583,8 @@ class Parser {
|
|||||||
val directors = videoTags.filter { it.property == "video:director" }.map { it.content }
|
val directors = videoTags.filter { it.property == "video:director" }.map { it.content }
|
||||||
val writers = videoTags.filter { it.property == "video:writer" }.map { it.content }
|
val writers = videoTags.filter { it.property == "video:writer" }.map { it.content }
|
||||||
val duration = videoTags.firstOrNull { it.property == "video:duration" }?.content?.toIntOrNull()
|
val duration = videoTags.firstOrNull { it.property == "video:duration" }?.content?.toIntOrNull()
|
||||||
val releaseDate = videoTags.firstOrNull { it.property == "video:release_date" }?.content
|
val releaseDateString = videoTags.firstOrNull { it.property == "video:release_date" }?.content
|
||||||
|
val releaseDate = parseDateTime(releaseDateString)
|
||||||
val tags = videoTags.filter { it.property == "video:tag" }.map { it.content }
|
val tags = videoTags.filter { it.property == "video:tag" }.map { it.content }
|
||||||
val series = videoTags.firstOrNull { it.property == "video:series" }?.content
|
val series = videoTags.firstOrNull { it.property == "video:series" }?.content
|
||||||
|
|
||||||
@@ -553,4 +598,4 @@ class Parser {
|
|||||||
series = series
|
series = series
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,10 +3,18 @@ package fr.lengrand.opengraphkt
|
|||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
import org.junit.jupiter.api.io.TempDir
|
import org.junit.jupiter.api.io.TempDir
|
||||||
import java.io.File
|
import java.io.File
|
||||||
|
import java.net.URL
|
||||||
|
import java.time.OffsetDateTime
|
||||||
import kotlin.test.assertEquals
|
import kotlin.test.assertEquals
|
||||||
import kotlin.test.assertNotNull
|
import kotlin.test.assertNotNull
|
||||||
import kotlin.test.assertTrue
|
import kotlin.test.assertTrue
|
||||||
|
|
||||||
|
/**
 * Test helper: asserts that [actual] is non-null and that its string form equals [expected].
 */
private fun assertUrlEquals(expected: String, actual: URL?) {
    // assertNotNull hands back the non-null value, so it can be rendered directly.
    val rendered = assertNotNull(actual).toString()
    assertEquals(expected, rendered)
}
|
||||||
|
|
||||||
class ParserTest {
|
class ParserTest {
|
||||||
|
|
||||||
private val parser = Parser()
|
private val parser = Parser()
|
||||||
@@ -21,6 +29,8 @@ class ParserTest {
|
|||||||
<meta property="og:type" content="video.movie" />
|
<meta property="og:type" content="video.movie" />
|
||||||
<meta property="og:url" content="https://example.com/the-rock" />
|
<meta property="og:url" content="https://example.com/the-rock" />
|
||||||
<meta property="og:image" content="https://example.com/rock.jpg" />
|
<meta property="og:image" content="https://example.com/rock.jpg" />
|
||||||
|
<meta property="og:image:secure_url" content="https://secure.example.com/rock.jpg" />
|
||||||
|
<meta property="og:image:type" content="image/jpeg" />
|
||||||
<meta property="og:image:width" content="300" />
|
<meta property="og:image:width" content="300" />
|
||||||
<meta property="og:image:height" content="200" />
|
<meta property="og:image:height" content="200" />
|
||||||
<meta property="og:image:alt" content="A promotional image for The Rock" />
|
<meta property="og:image:alt" content="A promotional image for The Rock" />
|
||||||
@@ -122,12 +132,18 @@ class ParserTest {
|
|||||||
<meta property="og:type" content="website" />
|
<meta property="og:type" content="website" />
|
||||||
<meta property="og:url" content="https://example.com/gallery" />
|
<meta property="og:url" content="https://example.com/gallery" />
|
||||||
<meta property="og:image" content="https://example.com/image1.jpg" />
|
<meta property="og:image" content="https://example.com/image1.jpg" />
|
||||||
|
<meta property="og:image:secure_url" content="https://secure.example.com/image1.jpg" />
|
||||||
|
<meta property="og:image:type" content="image/jpeg" />
|
||||||
<meta property="og:image:width" content="800" />
|
<meta property="og:image:width" content="800" />
|
||||||
<meta property="og:image:height" content="600" />
|
<meta property="og:image:height" content="600" />
|
||||||
<meta property="og:image" content="https://example.com/image2.jpg" />
|
<meta property="og:image" content="https://example.com/image2.jpg" />
|
||||||
|
<meta property="og:image:secure_url" content="https://secure.example.com/image2.jpg" />
|
||||||
|
<meta property="og:image:type" content="image/png" />
|
||||||
<meta property="og:image:width" content="1024" />
|
<meta property="og:image:width" content="1024" />
|
||||||
<meta property="og:image:height" content="768" />
|
<meta property="og:image:height" content="768" />
|
||||||
<meta property="og:image" content="https://example.com/image3.jpg" />
|
<meta property="og:image" content="https://example.com/image3.jpg" />
|
||||||
|
<meta property="og:image:secure_url" content="https://secure.example.com/image3.jpg" />
|
||||||
|
<meta property="og:image:type" content="image/gif" />
|
||||||
<meta property="og:image:width" content="1200" />
|
<meta property="og:image:width" content="1200" />
|
||||||
<meta property="og:image:height" content="900" />
|
<meta property="og:image:height" content="900" />
|
||||||
<meta property="og:description" content="A gallery of images" />
|
<meta property="og:description" content="A gallery of images" />
|
||||||
@@ -145,18 +161,20 @@ class ParserTest {
|
|||||||
// Verify that all required properties are extracted correctly
|
// Verify that all required properties are extracted correctly
|
||||||
assertEquals("The Rock", openGraphData.title)
|
assertEquals("The Rock", openGraphData.title)
|
||||||
assertEquals("video.movie", openGraphData.type)
|
assertEquals("video.movie", openGraphData.type)
|
||||||
assertEquals("https://example.com/the-rock", openGraphData.url)
|
assertUrlEquals("https://example.com/the-rock", openGraphData.url)
|
||||||
|
|
||||||
// Verify that the OpenGraphData object is valid
|
// Verify that the OpenGraphData object is valid
|
||||||
assertTrue(openGraphData.isValid())
|
assertTrue(openGraphData.isValid())
|
||||||
|
|
||||||
// Verify that all tags are extracted
|
// Verify that all tags are extracted
|
||||||
assertEquals(18, openGraphData.tags.size)
|
assertEquals(20, openGraphData.tags.size)
|
||||||
|
|
||||||
// Verify image properties
|
// Verify image properties
|
||||||
assertEquals(1, openGraphData.images.size)
|
assertEquals(1, openGraphData.images.size)
|
||||||
val image = openGraphData.images[0]
|
val image = openGraphData.images[0]
|
||||||
assertEquals("https://example.com/rock.jpg", image.url)
|
assertEquals("https://example.com/rock.jpg", image.url)
|
||||||
|
assertEquals("https://secure.example.com/rock.jpg", image.secureUrl)
|
||||||
|
assertEquals("image/jpeg", image.type)
|
||||||
assertEquals(300, image.width)
|
assertEquals(300, image.width)
|
||||||
assertEquals(200, image.height)
|
assertEquals(200, image.height)
|
||||||
assertEquals("A promotional image for The Rock", image.alt)
|
assertEquals("A promotional image for The Rock", image.alt)
|
||||||
@@ -198,13 +216,15 @@ class ParserTest {
|
|||||||
// Verify basic properties
|
// Verify basic properties
|
||||||
assertEquals("Breaking News", openGraphData.title)
|
assertEquals("Breaking News", openGraphData.title)
|
||||||
assertEquals("article", openGraphData.type)
|
assertEquals("article", openGraphData.type)
|
||||||
assertEquals("https://example.com/news/breaking", openGraphData.url)
|
assertUrlEquals("https://example.com/news/breaking", openGraphData.url)
|
||||||
assertEquals("Latest breaking news", openGraphData.description)
|
assertEquals("Latest breaking news", openGraphData.description)
|
||||||
|
|
||||||
// Verify article-specific properties
|
// Verify article-specific properties
|
||||||
assertNotNull(openGraphData.article)
|
assertNotNull(openGraphData.article)
|
||||||
assertEquals("2023-01-01T00:00:00Z", openGraphData.article.publishedTime)
|
assertNotNull(openGraphData.article.publishedTime)
|
||||||
assertEquals("2023-01-02T12:00:00Z", openGraphData.article.modifiedTime)
|
assertEquals(OffsetDateTime.parse("2023-01-01T00:00:00Z"), openGraphData.article.publishedTime)
|
||||||
|
assertNotNull(openGraphData.article.modifiedTime)
|
||||||
|
assertEquals(OffsetDateTime.parse("2023-01-02T12:00:00Z"), openGraphData.article.modifiedTime)
|
||||||
assertEquals("News", openGraphData.article.section)
|
assertEquals("News", openGraphData.article.section)
|
||||||
assertEquals(2, openGraphData.article.authors.size)
|
assertEquals(2, openGraphData.article.authors.size)
|
||||||
assertTrue(openGraphData.article.authors.contains("John Doe"))
|
assertTrue(openGraphData.article.authors.contains("John Doe"))
|
||||||
@@ -221,7 +241,7 @@ class ParserTest {
|
|||||||
// Verify basic properties
|
// Verify basic properties
|
||||||
assertEquals("John Doe", openGraphData.title)
|
assertEquals("John Doe", openGraphData.title)
|
||||||
assertEquals("profile", openGraphData.type)
|
assertEquals("profile", openGraphData.type)
|
||||||
assertEquals("https://example.com/profile/johndoe", openGraphData.url)
|
assertUrlEquals("https://example.com/profile/johndoe", openGraphData.url)
|
||||||
assertEquals("John Doe's profile", openGraphData.description)
|
assertEquals("John Doe's profile", openGraphData.description)
|
||||||
|
|
||||||
// Verify profile-specific properties
|
// Verify profile-specific properties
|
||||||
@@ -229,7 +249,7 @@ class ParserTest {
|
|||||||
assertEquals("John", openGraphData.profile.firstName)
|
assertEquals("John", openGraphData.profile.firstName)
|
||||||
assertEquals("Doe", openGraphData.profile.lastName)
|
assertEquals("Doe", openGraphData.profile.lastName)
|
||||||
assertEquals("johndoe", openGraphData.profile.username)
|
assertEquals("johndoe", openGraphData.profile.username)
|
||||||
assertEquals("male", openGraphData.profile.gender)
|
assertEquals(Gender.MALE, openGraphData.profile.gender)
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -239,7 +259,7 @@ class ParserTest {
|
|||||||
// Verify basic properties
|
// Verify basic properties
|
||||||
assertEquals("The Great Novel", openGraphData.title)
|
assertEquals("The Great Novel", openGraphData.title)
|
||||||
assertEquals("book", openGraphData.type)
|
assertEquals("book", openGraphData.type)
|
||||||
assertEquals("https://example.com/books/great-novel", openGraphData.url)
|
assertUrlEquals("https://example.com/books/great-novel", openGraphData.url)
|
||||||
assertEquals("A great novel", openGraphData.description)
|
assertEquals("A great novel", openGraphData.description)
|
||||||
|
|
||||||
// Verify book-specific properties
|
// Verify book-specific properties
|
||||||
@@ -247,7 +267,8 @@ class ParserTest {
|
|||||||
assertEquals(1, openGraphData.book.authors.size)
|
assertEquals(1, openGraphData.book.authors.size)
|
||||||
assertEquals("Famous Author", openGraphData.book.authors.get(0))
|
assertEquals("Famous Author", openGraphData.book.authors.get(0))
|
||||||
assertEquals("1234567890123", openGraphData.book.isbn)
|
assertEquals("1234567890123", openGraphData.book.isbn)
|
||||||
assertEquals("2023-01-01", openGraphData.book.releaseDate)
|
assertNotNull(openGraphData.book.releaseDate)
|
||||||
|
assertEquals(OffsetDateTime.parse("2023-01-01T00:00:00Z"), openGraphData.book.releaseDate)
|
||||||
assertEquals(2, openGraphData.book.tags.size)
|
assertEquals(2, openGraphData.book.tags.size)
|
||||||
assertTrue(openGraphData.book.tags.contains("fiction"))
|
assertTrue(openGraphData.book.tags.contains("fiction"))
|
||||||
assertTrue(openGraphData.book.tags.contains("novel"))
|
assertTrue(openGraphData.book.tags.contains("novel"))
|
||||||
@@ -260,7 +281,7 @@ class ParserTest {
|
|||||||
// Verify basic properties
|
// Verify basic properties
|
||||||
assertEquals("Photo Gallery", openGraphData.title)
|
assertEquals("Photo Gallery", openGraphData.title)
|
||||||
assertEquals("website", openGraphData.type)
|
assertEquals("website", openGraphData.type)
|
||||||
assertEquals("https://example.com/gallery", openGraphData.url)
|
assertUrlEquals("https://example.com/gallery", openGraphData.url)
|
||||||
assertEquals("A gallery of images", openGraphData.description)
|
assertEquals("A gallery of images", openGraphData.description)
|
||||||
|
|
||||||
// Verify multiple images
|
// Verify multiple images
|
||||||
@@ -268,16 +289,22 @@ class ParserTest {
|
|||||||
|
|
||||||
// First image
|
// First image
|
||||||
assertEquals("https://example.com/image1.jpg", openGraphData.images[0].url)
|
assertEquals("https://example.com/image1.jpg", openGraphData.images[0].url)
|
||||||
|
assertEquals("https://secure.example.com/image1.jpg", openGraphData.images[0].secureUrl)
|
||||||
|
assertEquals("image/jpeg", openGraphData.images[0].type)
|
||||||
assertEquals(800, openGraphData.images[0].width)
|
assertEquals(800, openGraphData.images[0].width)
|
||||||
assertEquals(600, openGraphData.images[0].height)
|
assertEquals(600, openGraphData.images[0].height)
|
||||||
|
|
||||||
// Second image
|
// Second image
|
||||||
assertEquals("https://example.com/image2.jpg", openGraphData.images[1].url)
|
assertEquals("https://example.com/image2.jpg", openGraphData.images[1].url)
|
||||||
|
assertEquals("https://secure.example.com/image2.jpg", openGraphData.images[1].secureUrl)
|
||||||
|
assertEquals("image/png", openGraphData.images[1].type)
|
||||||
assertEquals(1024, openGraphData.images[1].width)
|
assertEquals(1024, openGraphData.images[1].width)
|
||||||
assertEquals(768, openGraphData.images[1].height)
|
assertEquals(768, openGraphData.images[1].height)
|
||||||
|
|
||||||
// Third image
|
// Third image
|
||||||
assertEquals("https://example.com/image3.jpg", openGraphData.images[2].url)
|
assertEquals("https://example.com/image3.jpg", openGraphData.images[2].url)
|
||||||
|
assertEquals("https://secure.example.com/image3.jpg", openGraphData.images[2].secureUrl)
|
||||||
|
assertEquals("image/gif", openGraphData.images[2].type)
|
||||||
assertEquals(1200, openGraphData.images[2].width)
|
assertEquals(1200, openGraphData.images[2].width)
|
||||||
assertEquals(900, openGraphData.images[2].height)
|
assertEquals(900, openGraphData.images[2].height)
|
||||||
}
|
}
|
||||||
@@ -293,13 +320,15 @@ class ParserTest {
|
|||||||
// Verify basic properties
|
// Verify basic properties
|
||||||
assertEquals("Breaking News", openGraphData.title)
|
assertEquals("Breaking News", openGraphData.title)
|
||||||
assertEquals("article", openGraphData.type)
|
assertEquals("article", openGraphData.type)
|
||||||
assertEquals("https://example.com/news/breaking", openGraphData.url)
|
assertUrlEquals("https://example.com/news/breaking", openGraphData.url)
|
||||||
assertEquals("Latest breaking news", openGraphData.description)
|
assertEquals("Latest breaking news", openGraphData.description)
|
||||||
|
|
||||||
// Verify article-specific properties
|
// Verify article-specific properties
|
||||||
assertNotNull(openGraphData.article)
|
assertNotNull(openGraphData.article)
|
||||||
assertEquals("2023-01-01T00:00:00Z", openGraphData.article.publishedTime)
|
assertNotNull(openGraphData.article.publishedTime)
|
||||||
assertEquals("2023-01-02T12:00:00Z", openGraphData.article.modifiedTime)
|
assertEquals(OffsetDateTime.parse("2023-01-01T00:00:00Z"), openGraphData.article.publishedTime)
|
||||||
|
assertNotNull(openGraphData.article.modifiedTime)
|
||||||
|
assertEquals(OffsetDateTime.parse("2023-01-02T12:00:00Z"), openGraphData.article.modifiedTime)
|
||||||
assertEquals("News", openGraphData.article.section)
|
assertEquals("News", openGraphData.article.section)
|
||||||
assertEquals(2, openGraphData.article.authors.size)
|
assertEquals(2, openGraphData.article.authors.size)
|
||||||
assertTrue(openGraphData.article.authors.contains("John Doe"))
|
assertTrue(openGraphData.article.authors.contains("John Doe"))
|
||||||
@@ -368,7 +397,7 @@ class ParserTest {
|
|||||||
// Verify basic properties
|
// Verify basic properties
|
||||||
assertEquals("The Matrix", openGraphData.title)
|
assertEquals("The Matrix", openGraphData.title)
|
||||||
assertEquals("video.movie", openGraphData.type)
|
assertEquals("video.movie", openGraphData.type)
|
||||||
assertEquals("https://example.com/movies/the-matrix", openGraphData.url)
|
assertUrlEquals("https://example.com/movies/the-matrix", openGraphData.url)
|
||||||
assertEquals("A sci-fi action movie", openGraphData.description)
|
assertEquals("A sci-fi action movie", openGraphData.description)
|
||||||
|
|
||||||
// Verify video.movie-specific properties
|
// Verify video.movie-specific properties
|
||||||
@@ -383,12 +412,76 @@ class ParserTest {
|
|||||||
assertTrue(openGraphData.videoMovie.writer.contains("Lana Wachowski"))
|
assertTrue(openGraphData.videoMovie.writer.contains("Lana Wachowski"))
|
||||||
assertTrue(openGraphData.videoMovie.writer.contains("Lilly Wachowski"))
|
assertTrue(openGraphData.videoMovie.writer.contains("Lilly Wachowski"))
|
||||||
assertEquals(136, openGraphData.videoMovie.duration)
|
assertEquals(136, openGraphData.videoMovie.duration)
|
||||||
assertEquals("1999-03-31", openGraphData.videoMovie.releaseDate)
|
assertNotNull(openGraphData.videoMovie.releaseDate)
|
||||||
|
assertEquals(OffsetDateTime.parse("1999-03-31T00:00:00Z"), openGraphData.videoMovie.releaseDate)
|
||||||
assertEquals(2, openGraphData.videoMovie.tags.size)
|
assertEquals(2, openGraphData.videoMovie.tags.size)
|
||||||
assertTrue(openGraphData.videoMovie.tags.contains("sci-fi"))
|
assertTrue(openGraphData.videoMovie.tags.contains("sci-fi"))
|
||||||
assertTrue(openGraphData.videoMovie.tags.contains("action"))
|
assertTrue(openGraphData.videoMovie.tags.contains("action"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sample HTML with music.album-specific tags
|
||||||
|
private val musicAlbumHtml = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Music Album Example</title>
|
||||||
|
<meta property="og:title" content="Greatest Hits" />
|
||||||
|
<meta property="og:type" content="music.album" />
|
||||||
|
<meta property="og:url" content="https://example.com/albums/greatest-hits" />
|
||||||
|
<meta property="og:image" content="https://example.com/album-cover.jpg" />
|
||||||
|
<meta property="og:description" content="A collection of greatest hits" />
|
||||||
|
<meta property="og:music:song" content="Song 1" />
|
||||||
|
<meta property="og:music:song" content="Song 2" />
|
||||||
|
<meta property="og:music:song:disc" content="1" />
|
||||||
|
<meta property="og:music:song:track" content="1" />
|
||||||
|
<meta property="og:music:musician" content="Famous Musician" />
|
||||||
|
<meta property="og:music:musician" content="Another Musician" />
|
||||||
|
<meta property="og:music:release_date" content="2023-01-15T12:30:00Z" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Greatest Hits</h1>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""".trimIndent()
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test parse with music album-specific tags`() {
|
||||||
|
val openGraphData = parser.parse(musicAlbumHtml)
|
||||||
|
|
||||||
|
// Verify basic properties
|
||||||
|
assertEquals("Greatest Hits", openGraphData.title)
|
||||||
|
assertEquals("music.album", openGraphData.type)
|
||||||
|
assertUrlEquals("https://example.com/albums/greatest-hits", openGraphData.url)
|
||||||
|
assertEquals("A collection of greatest hits", openGraphData.description)
|
||||||
|
|
||||||
|
// Verify music.album-specific properties
|
||||||
|
assertNotNull(openGraphData.musicAlbum)
|
||||||
|
assertEquals(2, openGraphData.musicAlbum.songs.size)
|
||||||
|
assertTrue(openGraphData.musicAlbum.songs.contains("Song 1"))
|
||||||
|
assertTrue(openGraphData.musicAlbum.songs.contains("Song 2"))
|
||||||
|
assertEquals(1, openGraphData.musicAlbum.songDisc)
|
||||||
|
assertEquals(1, openGraphData.musicAlbum.songTrack)
|
||||||
|
assertEquals(2, openGraphData.musicAlbum.musician.size)
|
||||||
|
assertTrue(openGraphData.musicAlbum.musician.contains("Famous Musician"))
|
||||||
|
assertTrue(openGraphData.musicAlbum.musician.contains("Another Musician"))
|
||||||
|
|
||||||
|
// Verify releaseDate is correctly parsed as OffsetDateTime
|
||||||
|
assertNotNull(openGraphData.musicAlbum.releaseDate)
|
||||||
|
assertEquals(OffsetDateTime.parse("2023-01-15T12:30:00Z"), openGraphData.musicAlbum.releaseDate)
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test parse with date-only release date`() {
|
||||||
|
// Create a modified version of the music album HTML with a date-only release date
|
||||||
|
val dateOnlyHtml = musicAlbumHtml.replace("2023-01-15T12:30:00Z", "2023-01-15")
|
||||||
|
val openGraphData = parser.parse(dateOnlyHtml)
|
||||||
|
|
||||||
|
// Verify releaseDate is correctly parsed as OffsetDateTime with default time
|
||||||
|
assertNotNull(openGraphData.musicAlbum)
|
||||||
|
assertNotNull(openGraphData.musicAlbum.releaseDate)
|
||||||
|
assertEquals(OffsetDateTime.parse("2023-01-15T00:00:00Z"), openGraphData.musicAlbum.releaseDate)
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
fun `test getType method returns correct enum values`() {
|
fun `test getType method returns correct enum values`() {
|
||||||
// Test video.movie type
|
// Test video.movie type
|
||||||
@@ -407,6 +500,10 @@ class ParserTest {
|
|||||||
val bookData = parser.parse(bookHtml)
|
val bookData = parser.parse(bookHtml)
|
||||||
assertEquals(Type.BOOK, bookData.getType())
|
assertEquals(Type.BOOK, bookData.getType())
|
||||||
|
|
||||||
|
// Test music.album type
|
||||||
|
val musicAlbumData = parser.parse(musicAlbumHtml)
|
||||||
|
assertEquals(Type.MUSIC_ALBUM, musicAlbumData.getType())
|
||||||
|
|
||||||
// Test website type (should return UNKNOWN as it's not in our enum)
|
// Test website type (should return UNKNOWN as it's not in our enum)
|
||||||
val websiteData = parser.parse(multipleImagesHtml)
|
val websiteData = parser.parse(multipleImagesHtml)
|
||||||
assertEquals(Type.WEBSITE, websiteData.getType())
|
assertEquals(Type.WEBSITE, websiteData.getType())
|
||||||
@@ -419,4 +516,196 @@ class ParserTest {
|
|||||||
val unkwownData = parser.parse(unknownTypeHtml)
|
val unkwownData = parser.parse(unknownTypeHtml)
|
||||||
assertEquals(Type.UNKNOWN, unkwownData.getType())
|
assertEquals(Type.UNKNOWN, unkwownData.getType())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sample HTML with music.song-specific tags
|
||||||
|
private val musicSongHtml = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Music Song Example</title>
|
||||||
|
<meta property="og:title" content="Awesome Song" />
|
||||||
|
<meta property="og:type" content="music.song" />
|
||||||
|
<meta property="og:url" content="https://example.com/songs/awesome-song" />
|
||||||
|
<meta property="og:image" content="https://example.com/song-cover.jpg" />
|
||||||
|
<meta property="og:description" content="An awesome song" />
|
||||||
|
<meta property="og:music:duration" content="240" />
|
||||||
|
<meta property="og:music:album" content="Awesome Album" />
|
||||||
|
<meta property="og:music:album:disc" content="1" />
|
||||||
|
<meta property="og:music:album:track" content="3" />
|
||||||
|
<meta property="og:music:musician" content="Awesome Artist" />
|
||||||
|
<meta property="og:music:musician" content="Featured Artist" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Awesome Song</h1>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""".trimIndent()
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test parse with music song-specific tags`() {
|
||||||
|
val openGraphData = parser.parse(musicSongHtml)
|
||||||
|
|
||||||
|
// Verify basic properties
|
||||||
|
assertEquals("Awesome Song", openGraphData.title)
|
||||||
|
assertEquals("music.song", openGraphData.type)
|
||||||
|
assertUrlEquals("https://example.com/songs/awesome-song", openGraphData.url)
|
||||||
|
assertEquals("An awesome song", openGraphData.description)
|
||||||
|
|
||||||
|
// Verify music.song-specific properties
|
||||||
|
assertNotNull(openGraphData.musicSong)
|
||||||
|
assertEquals(240, openGraphData.musicSong.duration)
|
||||||
|
assertEquals("Awesome Album", openGraphData.musicSong.album)
|
||||||
|
assertEquals(1, openGraphData.musicSong.albumDisc)
|
||||||
|
assertEquals(3, openGraphData.musicSong.albumTrack)
|
||||||
|
assertEquals(2, openGraphData.musicSong.musician.size)
|
||||||
|
assertTrue(openGraphData.musicSong.musician.contains("Awesome Artist"))
|
||||||
|
assertTrue(openGraphData.musicSong.musician.contains("Featured Artist"))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sample HTML with music.playlist-specific tags
|
||||||
|
private val musicPlaylistHtml = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Music Playlist Example</title>
|
||||||
|
<meta property="og:title" content="Awesome Playlist" />
|
||||||
|
<meta property="og:type" content="music.playlist" />
|
||||||
|
<meta property="og:url" content="https://example.com/playlists/awesome-playlist" />
|
||||||
|
<meta property="og:image" content="https://example.com/playlist-cover.jpg" />
|
||||||
|
<meta property="og:description" content="An awesome playlist" />
|
||||||
|
<meta property="og:music:song" content="Song 1" />
|
||||||
|
<meta property="og:music:song" content="Song 2" />
|
||||||
|
<meta property="og:music:song" content="Song 3" />
|
||||||
|
<meta property="og:music:song:disc" content="1" />
|
||||||
|
<meta property="og:music:song:track" content="1" />
|
||||||
|
<meta property="og:music:creator" content="Playlist Creator" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Awesome Playlist</h1>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""".trimIndent()
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test parse with music playlist-specific tags`() {
|
||||||
|
val openGraphData = parser.parse(musicPlaylistHtml)
|
||||||
|
|
||||||
|
// Verify basic properties
|
||||||
|
assertEquals("Awesome Playlist", openGraphData.title)
|
||||||
|
assertEquals("music.playlist", openGraphData.type)
|
||||||
|
assertUrlEquals("https://example.com/playlists/awesome-playlist", openGraphData.url)
|
||||||
|
assertEquals("An awesome playlist", openGraphData.description)
|
||||||
|
|
||||||
|
// Verify music.playlist-specific properties
|
||||||
|
assertNotNull(openGraphData.musicPlaylist)
|
||||||
|
assertEquals(3, openGraphData.musicPlaylist.songs.size)
|
||||||
|
assertTrue(openGraphData.musicPlaylist.songs.contains("Song 1"))
|
||||||
|
assertTrue(openGraphData.musicPlaylist.songs.contains("Song 2"))
|
||||||
|
assertTrue(openGraphData.musicPlaylist.songs.contains("Song 3"))
|
||||||
|
assertEquals(1, openGraphData.musicPlaylist.songDisc)
|
||||||
|
assertEquals(1, openGraphData.musicPlaylist.songTrack)
|
||||||
|
assertEquals("Playlist Creator", openGraphData.musicPlaylist.creator)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sample HTML with music.radio_station-specific tags
|
||||||
|
private val musicRadioStationHtml = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Music Radio Station Example</title>
|
||||||
|
<meta property="og:title" content="Awesome Radio" />
|
||||||
|
<meta property="og:type" content="music.radio_station" />
|
||||||
|
<meta property="og:url" content="https://example.com/radio/awesome-radio" />
|
||||||
|
<meta property="og:image" content="https://example.com/radio-logo.jpg" />
|
||||||
|
<meta property="og:description" content="An awesome radio station" />
|
||||||
|
<meta property="og:music:creator" content="Radio Creator" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Awesome Radio</h1>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""".trimIndent()
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test parse with music radio station-specific tags`() {
|
||||||
|
val openGraphData = parser.parse(musicRadioStationHtml)
|
||||||
|
|
||||||
|
// Verify basic properties
|
||||||
|
assertEquals("Awesome Radio", openGraphData.title)
|
||||||
|
assertEquals("music.radio_station", openGraphData.type)
|
||||||
|
assertUrlEquals("https://example.com/radio/awesome-radio", openGraphData.url)
|
||||||
|
assertEquals("An awesome radio station", openGraphData.description)
|
||||||
|
|
||||||
|
// Verify music.radio_station-specific properties
|
||||||
|
assertNotNull(openGraphData.musicRadioStation)
|
||||||
|
assertEquals("Radio Creator", openGraphData.musicRadioStation.creator)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sample HTML with video.episode-specific tags
|
||||||
|
private val videoEpisodeHtml = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Video Episode Example</title>
|
||||||
|
<meta property="og:title" content="Awesome Episode" />
|
||||||
|
<meta property="og:type" content="video.episode" />
|
||||||
|
<meta property="og:url" content="https://example.com/episodes/awesome-episode" />
|
||||||
|
<meta property="og:image" content="https://example.com/episode-thumbnail.jpg" />
|
||||||
|
<meta property="og:description" content="An awesome episode" />
|
||||||
|
<meta property="og:video:actor" content="Actor 1" />
|
||||||
|
<meta property="og:video:actor" content="Actor 2" />
|
||||||
|
<meta property="og:video:director" content="Director 1" />
|
||||||
|
<meta property="og:video:writer" content="Writer 1" />
|
||||||
|
<meta property="og:video:writer" content="Writer 2" />
|
||||||
|
<meta property="og:video:duration" content="45" />
|
||||||
|
<meta property="og:video:release_date" content="2023-05-15" />
|
||||||
|
<meta property="og:video:tag" content="drama" />
|
||||||
|
<meta property="og:video:tag" content="comedy" />
|
||||||
|
<meta property="og:video:series" content="Awesome Series" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Awesome Episode</h1>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""".trimIndent()
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test parse with video episode-specific tags`() {
|
||||||
|
val openGraphData = parser.parse(videoEpisodeHtml)
|
||||||
|
|
||||||
|
// Verify basic properties
|
||||||
|
assertEquals("Awesome Episode", openGraphData.title)
|
||||||
|
assertEquals("video.episode", openGraphData.type)
|
||||||
|
assertUrlEquals("https://example.com/episodes/awesome-episode", openGraphData.url)
|
||||||
|
assertEquals("An awesome episode", openGraphData.description)
|
||||||
|
|
||||||
|
// Verify video.episode-specific properties
|
||||||
|
assertNotNull(openGraphData.videoEpisode)
|
||||||
|
assertEquals(2, openGraphData.videoEpisode.actors.size)
|
||||||
|
assertTrue(openGraphData.videoEpisode.actors.contains("Actor 1"))
|
||||||
|
assertTrue(openGraphData.videoEpisode.actors.contains("Actor 2"))
|
||||||
|
assertEquals(1, openGraphData.videoEpisode.director.size)
|
||||||
|
assertTrue(openGraphData.videoEpisode.director.contains("Director 1"))
|
||||||
|
assertEquals(2, openGraphData.videoEpisode.writer.size)
|
||||||
|
assertTrue(openGraphData.videoEpisode.writer.contains("Writer 1"))
|
||||||
|
assertTrue(openGraphData.videoEpisode.writer.contains("Writer 2"))
|
||||||
|
assertEquals(45, openGraphData.videoEpisode.duration)
|
||||||
|
assertNotNull(openGraphData.videoEpisode.releaseDate)
|
||||||
|
assertEquals(OffsetDateTime.parse("2023-05-15T00:00:00Z"), openGraphData.videoEpisode.releaseDate)
|
||||||
|
assertEquals(2, openGraphData.videoEpisode.tags.size)
|
||||||
|
assertTrue(openGraphData.videoEpisode.tags.contains("drama"))
|
||||||
|
assertTrue(openGraphData.videoEpisode.tags.contains("comedy"))
|
||||||
|
assertEquals("Awesome Series", openGraphData.videoEpisode.series)
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
fun `test Gender enum toString method`() {
|
||||||
|
// Test that the toString method returns the lowercase name of the enum value
|
||||||
|
assertEquals("male", Gender.MALE.toString())
|
||||||
|
assertEquals("female", Gender.FEMALE.toString())
|
||||||
|
|
||||||
|
// Test that the fromString method correctly converts a string to the enum value
|
||||||
|
assertEquals(Gender.MALE, Gender.fromString("MALE"))
|
||||||
|
assertEquals(Gender.FEMALE, Gender.fromString("FEMALE"))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
1
scrape-test/.gitignore
vendored
Normal file
1
scrape-test/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
data/web
|
||||||
20
scrape-test/README.md
Normal file
20
scrape-test/README.md
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# Scrape test module
|
||||||
|
|
||||||
|
The scrape test module is intended to test the implementation of the library at scale by parsing a large number of web pages and checking the quality of its results.
|
||||||
|
|
||||||
|
## Data
|
||||||
|
|
||||||
|
At this moment:
|
||||||
|
|
||||||
|
* one dataset was found on [Kaggle](https://www.kaggle.com/datasets/hetulmehta/website-classification).
|
||||||
|
* another on [Moz](https://moz.com/top-500/download/?table=top500Domains) (Top 500 most visited websites).
|
||||||
|
|
||||||
|
I'd like a more varied set of data from different types of sources, since the current sets mostly seem to contain homepages, but such data is surprisingly hard to find.
|
||||||
|
|
||||||
|
## Running the tests
|
||||||
|
|
||||||
|
For various reasons, I am not uploading the actual data of the various URLs. To run the analysis yourself:
|
||||||
|
|
||||||
|
1. Run `Scraper.kt` once, which will grab all the webpages and place them in the `data/web` folder.
|
||||||
|
2. Run `ParserTest.kt`, which will run the `Parser` on each of those web pages and check whether the tags can be extracted, and if the page is considered valid.
|
||||||
|
|
||||||
28
scrape-test/build.gradle.kts
Normal file
28
scrape-test/build.gradle.kts
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
plugins {
|
||||||
|
id("java")
|
||||||
|
kotlin("jvm")
|
||||||
|
}
|
||||||
|
|
||||||
|
group = "fr.lengrand"
|
||||||
|
version = "unspecified"
|
||||||
|
|
||||||
|
repositories {
|
||||||
|
mavenCentral()
|
||||||
|
}
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
testImplementation(platform("org.junit:junit-bom:5.10.0"))
|
||||||
|
testImplementation("org.junit.jupiter:junit-jupiter")
|
||||||
|
implementation(kotlin("stdlib-jdk8"))
|
||||||
|
|
||||||
|
implementation(project(":opengraphkt"))
|
||||||
|
implementation("io.ktor:ktor-client-core:3.1.3")
|
||||||
|
implementation("io.ktor:ktor-client-cio:3.1.3")
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
||||||
|
kotlin {
|
||||||
|
jvmToolchain(23)
|
||||||
|
}
|
||||||
501
scrape-test/data/top500.csv
Normal file
501
scrape-test/data/top500.csv
Normal file
@@ -0,0 +1,501 @@
|
|||||||
|
"Rank","Root Domain","Linking Root Domains","Domain Authority"
|
||||||
|
"1","www.google.com","15,236,114","100"
|
||||||
|
"2","www.blogger.com","31,311,113","100"
|
||||||
|
"3","youtube.com","24,336,912","100"
|
||||||
|
"4","linkedin.com","13,291,390","99"
|
||||||
|
"5","support.google.com","5,720,703","99"
|
||||||
|
"6","cloudflare.com","8,211,585","99"
|
||||||
|
"7","microsoft.com","5,593,547","99"
|
||||||
|
"8","apple.com","6,849,526","99"
|
||||||
|
"9","en.wikipedia.org","7,201,596","98"
|
||||||
|
"10","play.google.com","4,012,038","98"
|
||||||
|
"11","wordpress.org","12,511,154","98"
|
||||||
|
"12","docs.google.com","3,642,278","98"
|
||||||
|
"13","mozilla.org","2,593,193","98"
|
||||||
|
"14","maps.google.com","6,190,949","98"
|
||||||
|
"15","youtu.be","5,434,247","98"
|
||||||
|
"16","drive.google.com","2,681,591","97"
|
||||||
|
"17","bp.blogspot.com","18,327,022","97"
|
||||||
|
"18","sites.google.com","2,401,535","97"
|
||||||
|
"19","googleusercontent.com","3,994,187","97"
|
||||||
|
"20","accounts.google.com","2,557,208","97"
|
||||||
|
"21","t.me","1,826,000","97"
|
||||||
|
"22","europa.eu","2,437,683","97"
|
||||||
|
"23","plus.google.com","10,955,614","97"
|
||||||
|
"24","whatsapp.com","4,778,976","97"
|
||||||
|
"25","adobe.com","2,880,183","96"
|
||||||
|
"26","facebook.com","61,926,417","96"
|
||||||
|
"27","policies.google.com","3,521,103","96"
|
||||||
|
"28","uol.com.br","694,206","96"
|
||||||
|
"29","istockphoto.com","3,728,189","96"
|
||||||
|
"30","vimeo.com","3,628,948","96"
|
||||||
|
"31","vk.com","1,869,205","96"
|
||||||
|
"32","github.com","3,170,446","96"
|
||||||
|
"33","amazon.com","5,149,651","96"
|
||||||
|
"34","search.google.com","1,825,467","95"
|
||||||
|
"35","bbc.co.uk","1,750,633","95"
|
||||||
|
"36","google.de","1,083,507","95"
|
||||||
|
"37","live.com","1,022,973","95"
|
||||||
|
"38","gravatar.com","12,679,255","95"
|
||||||
|
"39","nih.gov","1,591,787","95"
|
||||||
|
"40","dan.com","4,340,736","95"
|
||||||
|
"41","files.wordpress.com","7,667,702","95"
|
||||||
|
"42","www.yahoo.com","1,307,493","95"
|
||||||
|
"43","cnn.com","1,672,093","95"
|
||||||
|
"44","dropbox.com","1,124,594","95"
|
||||||
|
"45","wikimedia.org","2,113,156","95"
|
||||||
|
"46","creativecommons.org","1,780,143","95"
|
||||||
|
"47","google.com.br","298,643","95"
|
||||||
|
"48","line.me","1,120,656","95"
|
||||||
|
"49","googleblog.com","4,497,927","95"
|
||||||
|
"50","opera.com","1,037,979","95"
|
||||||
|
"51","es.wikipedia.org","995,228","95"
|
||||||
|
"52","globo.com","468,129","95"
|
||||||
|
"53","brandbucket.com","11,171,565","95"
|
||||||
|
"54","myspace.com","1,364,994","95"
|
||||||
|
"55","slideshare.net","1,002,121","95"
|
||||||
|
"56","paypal.com","1,188,172","95"
|
||||||
|
"57","tiktok.com","1,499,229","95"
|
||||||
|
"58","netvibes.com","1,238,045","95"
|
||||||
|
"59","theguardian.com","1,607,812","95"
|
||||||
|
"60","who.int","2,039,611","95"
|
||||||
|
"61","goo.gl","5,175,255","95"
|
||||||
|
"62","medium.com","1,869,221","95"
|
||||||
|
"63","tools.google.com","1,854,489","95"
|
||||||
|
"64","draft.blogger.com","12,332,795","95"
|
||||||
|
"65","pt.wikipedia.org","426,145","95"
|
||||||
|
"66","fr.wikipedia.org","659,228","95"
|
||||||
|
"67","www.weebly.com","6,870,155","95"
|
||||||
|
"68","news.google.com","870,057","95"
|
||||||
|
"69","developers.google.com","1,170,712","95"
|
||||||
|
"70","w3.org","1,145,140","95"
|
||||||
|
"71","mail.google.com","691,883","95"
|
||||||
|
"72","gstatic.com","642,344","95"
|
||||||
|
"73","jimdofree.com","1,700,543","95"
|
||||||
|
"74","cpanel.net","2,172,574","95"
|
||||||
|
"75","imdb.com","1,561,359","95"
|
||||||
|
"76","wa.me","2,064,272","95"
|
||||||
|
"77","feedburner.com","1,792,625","95"
|
||||||
|
"78","enable-javascript.com","5,140,325","95"
|
||||||
|
"79","nytimes.com","2,218,148","95"
|
||||||
|
"80","workspace.google.com","774,353","95"
|
||||||
|
"81","ok.ru","378,557","95"
|
||||||
|
"82","google.es","480,733","95"
|
||||||
|
"83","dailymotion.com","1,132,411","95"
|
||||||
|
"84","afternic.com","2,480,391","94"
|
||||||
|
"85","bloomberg.com","887,696","94"
|
||||||
|
"86","amazon.de","569,270","94"
|
||||||
|
"87","photos.google.com","278,989","94"
|
||||||
|
"88","wiley.com","704,290","94"
|
||||||
|
"89","aliexpress.com","544,478","94"
|
||||||
|
"90","indiatimes.com","515,121","94"
|
||||||
|
"91","youronlinechoices.com","592,485","94"
|
||||||
|
"92","elpais.com","445,804","94"
|
||||||
|
"93","tinyurl.com","1,475,080","94"
|
||||||
|
"94","yadi.sk","158,812","94"
|
||||||
|
"95","spotify.com","1,828,736","94"
|
||||||
|
"96","huffpost.com","1,193,239","94"
|
||||||
|
"97","ru.wikipedia.org","378,068","94"
|
||||||
|
"98","google.fr","417,006","94"
|
||||||
|
"99","webmd.com","851,236","94"
|
||||||
|
"100","samsung.com","427,587","94"
|
||||||
|
"101","independent.co.uk","784,346","94"
|
||||||
|
"102","amazon.co.jp","884,321","94"
|
||||||
|
"103","get.google.com","626,795","94"
|
||||||
|
"104","amazon.co.uk","806,792","94"
|
||||||
|
"105","4shared.com","574,070","94"
|
||||||
|
"106","telegram.me","445,468","94"
|
||||||
|
"107","planalto.gov.br","126,833","94"
|
||||||
|
"108","businessinsider.com","878,711","94"
|
||||||
|
"109","ig.com.br","159,348","94"
|
||||||
|
"110","issuu.com","1,033,499","94"
|
||||||
|
"111","www.gov.br","187,184","94"
|
||||||
|
"112","wsj.com","999,835","94"
|
||||||
|
"113","hugedomains.com","16,362,198","94"
|
||||||
|
"114","picasaweb.google.com","619,826","94"
|
||||||
|
"115","usatoday.com","896,757","94"
|
||||||
|
"116","scribd.com","774,108","94"
|
||||||
|
"117","www.gov.uk","652,298","94"
|
||||||
|
"118","storage.googleapis.com","1,116,110","94"
|
||||||
|
"119","huffingtonpost.com","1,066,209","94"
|
||||||
|
"120","bbc.com","939,892","94"
|
||||||
|
"121","estadao.com.br","138,770","94"
|
||||||
|
"122","nature.com","690,832","94"
|
||||||
|
"123","mediafire.com","897,442","94"
|
||||||
|
"124","washingtonpost.com","1,194,795","94"
|
||||||
|
"125","forms.gle","966,378","94"
|
||||||
|
"126","namecheap.com","1,072,723","94"
|
||||||
|
"127","forbes.com","1,489,798","94"
|
||||||
|
"128","mirror.co.uk","428,559","94"
|
||||||
|
"129","soundcloud.com","1,918,274","94"
|
||||||
|
"130","fb.com","486,554","94"
|
||||||
|
"131","marketingplatform.google....","917,237","94"
|
||||||
|
"132","domainmarket.com","943,784","94"
|
||||||
|
"133","ytimg.com","1,070,336","94"
|
||||||
|
"134","terra.com.br","200,774","94"
|
||||||
|
"135","google.co.uk","590,081","94"
|
||||||
|
"136","shutterstock.com","563,596","94"
|
||||||
|
"137","dailymail.co.uk","1,133,056","94"
|
||||||
|
"138","reg.ru","540,012","94"
|
||||||
|
"139","t.co","2,196,874","94"
|
||||||
|
"140","cdc.gov","962,959","94"
|
||||||
|
"141","thesun.co.uk","430,774","94"
|
||||||
|
"142","wp.com","2,247,022","94"
|
||||||
|
"143","cnet.com","761,368","94"
|
||||||
|
"144","instagram.com","30,964,179","94"
|
||||||
|
"145","researchgate.net","797,083","94"
|
||||||
|
"146","google.it","443,661","94"
|
||||||
|
"147","fandom.com","659,960","94"
|
||||||
|
"148","office.com","756,429","94"
|
||||||
|
"149","list-manage.com","795,381","94"
|
||||||
|
"150","msn.com","1,086,862","94"
|
||||||
|
"151","un.org","649,321","94"
|
||||||
|
"152","de.wikipedia.org","682,828","94"
|
||||||
|
"153","ovh.com","678,504","94"
|
||||||
|
"154","mail.ru","481,079","94"
|
||||||
|
"155","bing.com","1,075,800","94"
|
||||||
|
"156","news.yahoo.com","720,761","94"
|
||||||
|
"157","myaccount.google.com","378,003","94"
|
||||||
|
"158","hatena.ne.jp","1,641,114","94"
|
||||||
|
"159","shopify.com","3,485,537","94"
|
||||||
|
"160","adssettings.google.com","482,210","94"
|
||||||
|
"161","bit.ly","5,047,889","94"
|
||||||
|
"162","reuters.com","971,280","94"
|
||||||
|
"163","booking.com","431,062","94"
|
||||||
|
"164","discord.com","507,193","94"
|
||||||
|
"165","buydomains.com","1,240,200","94"
|
||||||
|
"166","nasa.gov","707,621","94"
|
||||||
|
"167","aboutads.info","723,570","94"
|
||||||
|
"168","time.com","858,096","94"
|
||||||
|
"169","abril.com.br","279,992","94"
|
||||||
|
"170","change.org","509,706","94"
|
||||||
|
"171","nginx.org","1,177,240","94"
|
||||||
|
"172","twitter.com","61,414,860","94"
|
||||||
|
"173","www.wikipedia.org","553,224","94"
|
||||||
|
"174","archive.org","1,389,450","94"
|
||||||
|
"175","cbsnews.com","705,330","94"
|
||||||
|
"176","networkadvertising.org","707,285","94"
|
||||||
|
"177","telegraph.co.uk","1,014,460","94"
|
||||||
|
"178","pinterest.com","9,106,097","94"
|
||||||
|
"179","google.co.jp","648,233","94"
|
||||||
|
"180","pixabay.com","510,897","94"
|
||||||
|
"181","zendesk.com","542,253","93"
|
||||||
|
"182","cpanel.com","970,971","93"
|
||||||
|
"183","vistaprint.com","815,228","93"
|
||||||
|
"184","sky.com","251,300","93"
|
||||||
|
"185","windows.net","460,289","93"
|
||||||
|
"186","alicdn.com","572,952","93"
|
||||||
|
"187","google.ca","339,139","93"
|
||||||
|
"188","lemonde.fr","287,813","93"
|
||||||
|
"189","newyorker.com","453,295","93"
|
||||||
|
"190","webnode.page","516,598","93"
|
||||||
|
"191","surveymonkey.com","459,483","93"
|
||||||
|
"192","translate.google.com","297,180","93"
|
||||||
|
"193","calendar.google.com","227,821","93"
|
||||||
|
"194","amazonaws.com","419,795","93"
|
||||||
|
"195","academia.edu","426,259","93"
|
||||||
|
"196","apache.org","1,094,485","93"
|
||||||
|
"197","imageshack.us","707,362","93"
|
||||||
|
"198","akamaihd.net","599,670","93"
|
||||||
|
"199","nginx.com","1,090,331","93"
|
||||||
|
"200","discord.gg","467,826","93"
|
||||||
|
"201","thetimes.co.uk","435,853","93"
|
||||||
|
"202","search.yahoo.com","525,853","93"
|
||||||
|
"203","amazon.fr","269,771","93"
|
||||||
|
"204","yelp.com","987,170","93"
|
||||||
|
"205","berkeley.edu","510,730","93"
|
||||||
|
"206","google.ru","198,345","93"
|
||||||
|
"207","sedoparking.com","726,100","93"
|
||||||
|
"208","cbc.ca","512,969","93"
|
||||||
|
"209","unesco.org","372,647","93"
|
||||||
|
"210","ggpht.com","790,656","93"
|
||||||
|
"211","privacyshield.gov","426,999","93"
|
||||||
|
"212","www.over-blog.com","811,888","93"
|
||||||
|
"213","clarin.com","150,420","93"
|
||||||
|
"214","www.wix.com","2,809,873","93"
|
||||||
|
"215","whitehouse.gov","423,904","93"
|
||||||
|
"216","icann.org","646,968","93"
|
||||||
|
"217","gnu.org","597,621","93"
|
||||||
|
"218","yandex.ru","1,067,333","93"
|
||||||
|
"219","francetvinfo.fr","184,067","93"
|
||||||
|
"220","gmail.com","216,193","93"
|
||||||
|
"221","mozilla.com","208,426","93"
|
||||||
|
"222","ziddu.com","192,713","93"
|
||||||
|
"223","guardian.co.uk","597,395","93"
|
||||||
|
"224","twitch.tv","553,259","93"
|
||||||
|
"225","sedo.com","2,498,739","93"
|
||||||
|
"226","foxnews.com","600,891","93"
|
||||||
|
"227","rambler.ru","934,373","93"
|
||||||
|
"228","books.google.com","453,152","93"
|
||||||
|
"229","stanford.edu","672,405","93"
|
||||||
|
"230","wikihow.com","664,665","93"
|
||||||
|
"231","it.wikipedia.org","354,035","93"
|
||||||
|
"232","20minutos.es","183,667","93"
|
||||||
|
"233","sfgate.com","387,976","93"
|
||||||
|
"234","liveinternet.ru","358,214","93"
|
||||||
|
"235","ja.wikipedia.org","377,585","93"
|
||||||
|
"236","000webhost.com","243,969","93"
|
||||||
|
"237","espn.com","410,145","93"
|
||||||
|
"238","eventbrite.com","685,661","93"
|
||||||
|
"239","disney.com","269,415","93"
|
||||||
|
"240","statista.com","428,155","93"
|
||||||
|
"241","addthis.com","637,495","93"
|
||||||
|
"242","pinterest.fr","122,907","93"
|
||||||
|
"243","lavanguardia.com","173,195","93"
|
||||||
|
"244","vkontakte.ru","336,278","93"
|
||||||
|
"245","doubleclick.net","496,992","93"
|
||||||
|
"246","bp2.blogger.com","561,940","93"
|
||||||
|
"247","skype.com","437,720","93"
|
||||||
|
"248","sciencedaily.com","380,140","93"
|
||||||
|
"249","bloglovin.com","547,875","93"
|
||||||
|
"250","insider.com","407,525","93"
|
||||||
|
"251","pl.wikipedia.org","150,769","93"
|
||||||
|
"252","sputniknews.com","185,708","93"
|
||||||
|
"253","id.wikipedia.org","567,598","93"
|
||||||
|
"254","doi.org","619,455","93"
|
||||||
|
"255","nypost.com","457,559","93"
|
||||||
|
"256","elmundo.es","248,073","93"
|
||||||
|
"257","abcnews.go.com","596,975","93"
|
||||||
|
"258","ipv4.google.com","325,798","93"
|
||||||
|
"259","deezer.com","177,906","93"
|
||||||
|
"260","express.co.uk","393,591","93"
|
||||||
|
"261","detik.com","407,103","93"
|
||||||
|
"262","mystrikingly.com","424,979","93"
|
||||||
|
"263","rakuten.co.jp","672,132","93"
|
||||||
|
"264","amzn.to","1,050,344","93"
|
||||||
|
"265","arxiv.org","293,699","93"
|
||||||
|
"266","alibaba.com","403,447","93"
|
||||||
|
"267","fb.me","354,320","93"
|
||||||
|
"268","wikia.com","505,599","93"
|
||||||
|
"269","t-online.de","263,484","93"
|
||||||
|
"270","telegra.ph","336,651","93"
|
||||||
|
"271","mega.nz","211,685","93"
|
||||||
|
"272","usnews.com","487,873","93"
|
||||||
|
"273","plos.org","342,614","93"
|
||||||
|
"274","naver.com","551,101","93"
|
||||||
|
"275","ibm.com","551,601","93"
|
||||||
|
"276","smh.com.au","339,840","93"
|
||||||
|
"277","dw.com","414,201","93"
|
||||||
|
"278","google.nl","276,116","93"
|
||||||
|
"279","lefigaro.fr","234,085","93"
|
||||||
|
"280","bp1.blogger.com","561,364","93"
|
||||||
|
"281","picasa.google.com","257,556","93"
|
||||||
|
"282","theatlantic.com","598,135","93"
|
||||||
|
"283","nydailynews.com","390,352","93"
|
||||||
|
"284","themeforest.net","545,669","93"
|
||||||
|
"285","rtve.es","200,359","93"
|
||||||
|
"286","newsweek.com","423,875","93"
|
||||||
|
"287","ovh.net","543,402","93"
|
||||||
|
"288","ca.gov","581,381","93"
|
||||||
|
"289","goodreads.com","954,326","93"
|
||||||
|
"290","economist.com","404,606","93"
|
||||||
|
"291","target.com","317,574","93"
|
||||||
|
"292","marca.com","126,447","93"
|
||||||
|
"293","kickstarter.com","504,234","93"
|
||||||
|
"294","hindustantimes.com","262,717","93"
|
||||||
|
"295","weibo.com","1,415,948","93"
|
||||||
|
"296","finance.yahoo.com","508,841","93"
|
||||||
|
"297","huawei.com","1,938,323","93"
|
||||||
|
"298","e-monsite.com","145,440","93"
|
||||||
|
"299","hubspot.com","385,555","93"
|
||||||
|
"300","npr.org","895,466","93"
|
||||||
|
"301","netflix.com","404,990","93"
|
||||||
|
"302","gizmodo.com","410,719","93"
|
||||||
|
"303","netlify.app","445,381","93"
|
||||||
|
"304","yandex.com","1,553,717","93"
|
||||||
|
"305","mashable.com","457,794","93"
|
||||||
|
"306","cnil.fr","251,699","93"
|
||||||
|
"307","latimes.com","748,209","93"
|
||||||
|
"308","steampowered.com","318,831","93"
|
||||||
|
"309","rt.com","313,229","93"
|
||||||
|
"310","photobucket.com","1,625,154","93"
|
||||||
|
"311","quora.com","495,634","93"
|
||||||
|
"312","nbcnews.com","722,611","93"
|
||||||
|
"313","android.com","323,946","93"
|
||||||
|
"314","instructables.com","364,549","93"
|
||||||
|
"315","www.canalblog.com","410,731","93"
|
||||||
|
"316","www.livejournal.com","3,021,508","93"
|
||||||
|
"317","ouest-france.fr","145,514","93"
|
||||||
|
"318","tripadvisor.com","784,126","93"
|
||||||
|
"319","ovhcloud.com","683,341","93"
|
||||||
|
"320","pexels.com","397,595","93"
|
||||||
|
"321","oracle.com","646,434","93"
|
||||||
|
"322","yahoo.co.jp","670,052","93"
|
||||||
|
"323","addtoany.com","794,722","93"
|
||||||
|
"324","sakura.ne.jp","413,375","93"
|
||||||
|
"325","cointernet.com.co","693,382","93"
|
||||||
|
"326","twimg.com","825,776","93"
|
||||||
|
"327","britannica.com","588,729","93"
|
||||||
|
"328","php.net","689,106","93"
|
||||||
|
"329","standard.co.uk","274,064","93"
|
||||||
|
"330","groups.google.com","419,267","93"
|
||||||
|
"331","cnbc.com","613,798","93"
|
||||||
|
"332","loc.gov","492,836","93"
|
||||||
|
"333","qq.com","4,380,702","93"
|
||||||
|
"334","buzzfeed.com","637,933","93"
|
||||||
|
"335","godaddy.com","2,529,806","93"
|
||||||
|
"336","ikea.com","384,319","93"
|
||||||
|
"337","disqus.com","928,440","93"
|
||||||
|
"338","taringa.net","151,257","93"
|
||||||
|
"339","ea.com","237,847","93"
|
||||||
|
"340","dropcatch.com","875,677","93"
|
||||||
|
"341","techcrunch.com","543,668","93"
|
||||||
|
"342","canva.com","362,582","93"
|
||||||
|
"343","offset.com","1,988,241","93"
|
||||||
|
"344","ebay.com","961,717","93"
|
||||||
|
"345","zoom.us","704,595","93"
|
||||||
|
"346","cambridge.org","397,605","93"
|
||||||
|
"347","unsplash.com","687,293","93"
|
||||||
|
"348","playstation.com","273,692","93"
|
||||||
|
"349","people.com","288,670","93"
|
||||||
|
"350","springer.com","573,622","93"
|
||||||
|
"351","psychologytoday.com","418,913","93"
|
||||||
|
"352","sendspace.com","151,427","93"
|
||||||
|
"353","home.pl","217,375","93"
|
||||||
|
"354","rapidshare.com","293,608","93"
|
||||||
|
"355","prezi.com","344,865","93"
|
||||||
|
"356","photos1.blogger.com","780,018","93"
|
||||||
|
"357","thenai.org","462,289","93"
|
||||||
|
"358","ftc.gov","314,810","93"
|
||||||
|
"359","google.pl","190,864","93"
|
||||||
|
"360","ted.com","657,104","93"
|
||||||
|
"361","secureserver.net","872,578","93"
|
||||||
|
"362","code.google.com","317,681","93"
|
||||||
|
"363","plesk.com","718,603","93"
|
||||||
|
"364","aol.com","670,116","93"
|
||||||
|
"365","biglobe.ne.jp","270,506","93"
|
||||||
|
"366","hp.com","504,078","93"
|
||||||
|
"367","canada.ca","330,831","93"
|
||||||
|
"368","linktr.ee","605,356","93"
|
||||||
|
"369","hollywoodreporter.com","319,018","93"
|
||||||
|
"370","ietf.org","414,873","93"
|
||||||
|
"371","clickbank.net","469,022","93"
|
||||||
|
"372","harvard.edu","826,745","93"
|
||||||
|
"373","amazon.es","204,999","93"
|
||||||
|
"374","oup.com","460,417","93"
|
||||||
|
"375","timeweb.ru","664,889","93"
|
||||||
|
"376","engadget.com","448,101","93"
|
||||||
|
"377","vice.com","427,956","93"
|
||||||
|
"378","cornell.edu","496,567","93"
|
||||||
|
"379","dreamstime.com","454,338","93"
|
||||||
|
"380","tmz.com","243,067","93"
|
||||||
|
"381","gofundme.com","371,903","93"
|
||||||
|
"382","pbs.org","564,342","93"
|
||||||
|
"383","stackoverflow.com","413,584","93"
|
||||||
|
"384","abc.net.au","429,595","93"
|
||||||
|
"385","sciencedirect.com","754,026","93"
|
||||||
|
"386","ft.com","528,414","93"
|
||||||
|
"387","variety.com","333,021","93"
|
||||||
|
"388","alexa.com","307,499","93"
|
||||||
|
"389","abc.es","213,186","93"
|
||||||
|
"390","walmart.com","390,872","93"
|
||||||
|
"391","gooyaabitemplates.com","600,604","93"
|
||||||
|
"392","redbull.com","175,031","93"
|
||||||
|
"393","ssl-images-amazon.com","587,621","93"
|
||||||
|
"394","theverge.com","439,537","93"
|
||||||
|
"395","spiegel.de","374,585","93"
|
||||||
|
"396","about.com","732,285","93"
|
||||||
|
"397","nationalgeographic.com","653,291","93"
|
||||||
|
"398","bandcamp.com","648,189","93"
|
||||||
|
"399","m.wikipedia.org","353,593","93"
|
||||||
|
"400","zippyshare.com","189,307","93"
|
||||||
|
"401","wired.com","713,599","93"
|
||||||
|
"402","freepik.com","386,749","93"
|
||||||
|
"403","outlook.com","427,062","93"
|
||||||
|
"404","mit.edu","757,903","93"
|
||||||
|
"405","sapo.pt","240,677","93"
|
||||||
|
"406","goo.ne.jp","332,616","92"
|
||||||
|
"407","java.com","151,581","92"
|
||||||
|
"408","google.co.th","120,992","92"
|
||||||
|
"409","scmp.com","204,983","92"
|
||||||
|
"410","mayoclinic.org","464,667","92"
|
||||||
|
"411","scholastic.com","200,881","92"
|
||||||
|
"412","nba.com","253,654","92"
|
||||||
|
"413","reverbnation.com","227,313","92"
|
||||||
|
"414","depositfiles.com","143,553","92"
|
||||||
|
"415","video.google.com","213,021","92"
|
||||||
|
"416","howstuffworks.com","324,205","92"
|
||||||
|
"417","cbslocal.com","310,532","92"
|
||||||
|
"418","merriam-webster.com","348,548","92"
|
||||||
|
"419","focus.de","167,487","92"
|
||||||
|
"420","admin.ch","232,963","92"
|
||||||
|
"421","gfycat.com","161,812","92"
|
||||||
|
"422","com.com","246,021","92"
|
||||||
|
"423","narod.ru","272,108","92"
|
||||||
|
"424","boston.com","327,894","92"
|
||||||
|
"425","sony.com","176,593","92"
|
||||||
|
"426","justjared.com","124,409","92"
|
||||||
|
"427","bitly.com","371,241","92"
|
||||||
|
"428","jstor.org","275,031","92"
|
||||||
|
"429","amebaownd.com","192,731","92"
|
||||||
|
"430","g.co","188,465","92"
|
||||||
|
"431","gsmarena.com","137,657","92"
|
||||||
|
"432","lexpress.fr","123,852","92"
|
||||||
|
"433","reddit.com","7,039,676","92"
|
||||||
|
"434","usgs.gov","255,999","92"
|
||||||
|
"435","bigcommerce.com","355,749","92"
|
||||||
|
"436","gettyimages.com","404,317","92"
|
||||||
|
"437","ign.com","331,808","92"
|
||||||
|
"438","justgiving.com","162,197","92"
|
||||||
|
"439","techradar.com","212,371","92"
|
||||||
|
"440","weather.com","233,771","92"
|
||||||
|
"441","amazon.ca","256,117","92"
|
||||||
|
"442","justice.gov","233,706","92"
|
||||||
|
"443","sciencemag.org","317,941","92"
|
||||||
|
"444","pcmag.com","317,511","92"
|
||||||
|
"445","theconversation.com","373,863","92"
|
||||||
|
"446","foursquare.com","274,397","92"
|
||||||
|
"447","flickr.com","9,156,536","92"
|
||||||
|
"448","giphy.com","471,107","92"
|
||||||
|
"449","tvtropes.org","143,379","92"
|
||||||
|
"450","fifa.com","201,899","92"
|
||||||
|
"451","upenn.edu","339,397","92"
|
||||||
|
"452","digg.com","816,971","92"
|
||||||
|
"453","bestfreecams.club","394,385","92"
|
||||||
|
"454","histats.com","452,459","92"
|
||||||
|
"455","salesforce.com","256,815","92"
|
||||||
|
"456","blog.google","157,787","92"
|
||||||
|
"457","apnews.com","331,978","92"
|
||||||
|
"458","theglobeandmail.com","275,289","92"
|
||||||
|
"459","m.me","268,002","92"
|
||||||
|
"460","europapress.es","120,539","92"
|
||||||
|
"461","washington.edu","390,985","92"
|
||||||
|
"462","thefreedictionary.com","269,796","92"
|
||||||
|
"463","jhu.edu","263,019","92"
|
||||||
|
"464","euronews.com","220,805","92"
|
||||||
|
"465","liberation.fr","127,411","92"
|
||||||
|
"466","ads.google.com","167,381","92"
|
||||||
|
"467","trustpilot.com","528,385","92"
|
||||||
|
"468","google.com.tw","148,361","92"
|
||||||
|
"469","softonic.com","163,143","92"
|
||||||
|
"470","kakao.com","173,573","92"
|
||||||
|
"471","storage.canalblog.com","320,126","92"
|
||||||
|
"472","interia.pl","163,015","92"
|
||||||
|
"473","metro.co.uk","286,308","92"
|
||||||
|
"474","viglink.com","398,156","92"
|
||||||
|
"475","last.fm","444,132","92"
|
||||||
|
"476","blackberry.com","145,035","92"
|
||||||
|
"477","public-api.wordpress.com","188,788","92"
|
||||||
|
"478","sina.com.cn","993,710","92"
|
||||||
|
"479","unicef.org","222,033","92"
|
||||||
|
"480","archives.gov","286,115","92"
|
||||||
|
"481","nps.gov","392,668","92"
|
||||||
|
"482","utexas.edu","291,645","92"
|
||||||
|
"483","biblegateway.com","289,507","92"
|
||||||
|
"484","usda.gov","446,386","92"
|
||||||
|
"485","indiegogo.com","276,002","92"
|
||||||
|
"486","nikkei.com","262,714","92"
|
||||||
|
"487","radiofrance.fr","137,232","92"
|
||||||
|
"488","repubblica.it","216,052","92"
|
||||||
|
"489","substack.com","272,562","92"
|
||||||
|
"490","ap.org","195,696","92"
|
||||||
|
"491","nicovideo.jp","165,700","92"
|
||||||
|
"492","joomla.org","224,141","92"
|
||||||
|
"493","news.com.au","300,915","92"
|
||||||
|
"494","allaboutcookies.org","477,121","92"
|
||||||
|
"495","mailchimp.com","421,675","92"
|
||||||
|
"496","stores.jp","440,302","92"
|
||||||
|
"497","intel.com","268,542","92"
|
||||||
|
"498","bp0.blogger.com","561,170","92"
|
||||||
|
"499","box.com","288,327","92"
|
||||||
|
"500","nhk.or.jp","256,113","92"
|
||||||
|
1409
scrape-test/data/website_classification.csv
Normal file
1409
scrape-test/data/website_classification.csv
Normal file
File diff suppressed because one or more lines are too long
42
scrape-test/src/main/kotlin/fr/lengrand/scrape/ParserTest.kt
Normal file
42
scrape-test/src/main/kotlin/fr/lengrand/scrape/ParserTest.kt
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
package fr.lengrand.scrape
|
||||||
|
|
||||||
|
import fr.lengrand.opengraphkt.Parser
|
||||||
|
import java.net.URI
|
||||||
|
import java.nio.file.Files
|
||||||
|
import java.nio.file.Paths
|
||||||
|
|
||||||
|
/**
 * Walks the scraped-pages folder (`./scrape-test/data/web`, resolved against the working directory),
 * runs the Open Graph parser once per file found, and prints a summary of how many attempts
 * succeeded, yielded valid Open Graph data, or failed with an exception.
 */
fun main() {
    val parser = Parser()

    var total = 0
    var success = 0
    var error = 0
    var valid = 0

    val websiteFolder = "./scrape-test/data/web"
    val path = Paths.get(websiteFolder)

    // Files.walk holds open directory handles until the stream is closed, so scope it with use.
    Files.walk(path).use { entries ->
        entries
            // The walk also yields the root folder itself (and any sub-folders); only files are pages.
            .filter { Files.isRegularFile(it) }
            .forEach { file ->
                println("filename: $file")
                total++
                try {
                    // NOTE(review): every iteration parses this fixed remote URL rather than `file`.
                    // This looks like a placeholder; confirm which Parser overload should receive
                    // the scraped file before relying on these counters.
                    val openGraphData = parser.parse(URI("https://www.imdb.com/title/tt0068646/").toURL())
                    success++

                    if (openGraphData.isValid()) {
                        valid++
                    }
                } catch (e: Exception) {
                    println("Error parsing URL: ${e.message}")
                    error++
                }
            }
    }

    println("Total: $total")
    println("Success: $success")
    println("Valid: $valid")
    println("Error: $error")
}
|
||||||
146
scrape-test/src/main/kotlin/fr/lengrand/scrape/Scraper.kt
Normal file
146
scrape-test/src/main/kotlin/fr/lengrand/scrape/Scraper.kt
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
package fr.lengrand.scrape
|
||||||
|
|
||||||
|
import io.ktor.client.HttpClient
|
||||||
|
import io.ktor.client.engine.cio.CIO
|
||||||
|
import io.ktor.client.plugins.BrowserUserAgent
|
||||||
|
import io.ktor.client.plugins.HttpTimeout
|
||||||
|
import io.ktor.client.request.get
|
||||||
|
import io.ktor.client.statement.HttpResponse
|
||||||
|
import io.ktor.client.statement.bodyAsText
|
||||||
|
import kotlinx.coroutines.Dispatchers
|
||||||
|
import kotlinx.coroutines.joinAll
|
||||||
|
import kotlinx.coroutines.launch
|
||||||
|
import kotlinx.coroutines.withContext
|
||||||
|
import java.io.File
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger
|
||||||
|
|
||||||
|
/**
 * One row of the website classification CSV.
 *
 * @property id numeric identifier taken from the first column
 * @property url address of the website, taken from the second column
 * @property description text taken from the third column
 * @property category label taken from the fourth column
 */
data class Website(
    val id: Int,
    val url: String,
    val description: String,
    val category: String,
)
|
||||||
|
|
||||||
|
// Both paths are relative to the repository root: the process starts in the main repo folder,
// not in the scrape-test module itself.

/** CSV file listing the websites to download. */
val dataFile: String = "./scrape-test/data/website_classification.csv"

/** Folder that receives one downloaded HTML file per website. */
val siteDirectory: String = "./scrape-test/data/web"
|
||||||
|
|
||||||
|
/**
 * Bulk webpage scraper using Ktor to download HTML from multiple URLs in parallel
 * and save the content to individual files.
 *
 * The output directory is created when the scraper is constructed. Call [close] (or wrap the
 * scraper in `use { }`) once scraping is done to release the underlying HTTP client.
 *
 * @param outputDirectory folder that receives one file per scraped URL
 * @param concurrencyLevel maximum number of pages downloaded at the same time
 * @param requestTimeoutMillis per-request timeout handed to Ktor's [HttpTimeout] plugin
 */
@OptIn(kotlinx.coroutines.ExperimentalCoroutinesApi::class)
class WebScraper(
    private val outputDirectory: String = "scrape-test/data/web",
    private val concurrencyLevel: Int = 20,
    private val requestTimeoutMillis: Long = 30000
) : AutoCloseable {
    private val client = HttpClient(CIO) {
        install(HttpTimeout) {
            requestTimeoutMillis = this@WebScraper.requestTimeoutMillis
        }
        BrowserUserAgent()
    }

    private val completedCount = AtomicInteger(0)
    private val failedCount = AtomicInteger(0)
    private val totalCount = AtomicInteger(0)

    // Compiled once instead of on every sanitizeURls call.
    private val schemePrefix = Regex("^https?://")
    private val unsafeFilenameChars = Regex("[^a-zA-Z0-9.-]")

    init {
        File(outputDirectory).mkdirs()
    }

    /**
     * Scrapes a single webpage and saves it to a file.
     *
     * Failures are reported on stdout and counted rather than rethrown, so one bad URL does not
     * abort the batch. Cancellation is the exception: it is always propagated.
     *
     * @param url The URL to scrape
     * @param outputFilePath The file path to save the HTML content
     */
    private suspend fun scrapeWebpage(url: String, outputFilePath: String) {
        try {
            val response: HttpResponse = client.get(url)
            val htmlContent = response.bodyAsText()
            File(outputFilePath).writeText(htmlContent)

            println("[${completedCount.incrementAndGet()}/${totalCount.get()}] Successfully scraped: $url")
        } catch (e: kotlinx.coroutines.CancellationException) {
            // Swallowing cancellation would break structured concurrency; let it propagate.
            throw e
        } catch (e: Exception) {
            println("[${failedCount.incrementAndGet()}/${totalCount.get()}] Failed to scrape $url: ${e.message}")
        }
    }

    /**
     * Scrapes multiple webpages in parallel and saves them to files.
     *
     * Suspends until every URL has either been saved or reported as failed, then prints a summary.
     *
     * @param urls List of URLs to scrape
     */
    suspend fun scrapeWebpages(urls: List<String>) {
        totalCount.set(urls.size)
        completedCount.set(0)
        failedCount.set(0)

        println("Starting to scrape ${urls.size} URLs with concurrency level: $concurrencyLevel")

        // Caps the number of threads used for the blocking file writes.
        val dispatcher = Dispatchers.IO.limitedParallelism(concurrencyLevel)

        // limitedParallelism only bounds threads: a coroutine suspended inside client.get frees its
        // slot, so without this semaphore every request would be in flight at once.
        val inFlight = kotlinx.coroutines.sync.Semaphore(concurrencyLevel)

        withContext(dispatcher) {
            urls.map { url ->
                val outputPath = "$outputDirectory/${sanitizeURls(url)}"

                // Launch a coroutine for each URL; it waits for a permit before downloading.
                launch {
                    inFlight.acquire()
                    try {
                        scrapeWebpage(url, outputPath)
                    } finally {
                        inFlight.release()
                    }
                }
            }.joinAll() // Wait for all coroutines to complete
        }

        println("Scraping completed. Total: ${urls.size}, Successful: ${completedCount.get()}, Failed: ${failedCount.get()}")
    }

    /**
     * Generates a safe filename from a URL.
     *
     * Drops a leading `http://` or `https://` and replaces every character other than ASCII
     * letters, digits, `.` and `-` with `_`.
     */
    fun sanitizeURls(url: String): String =
        url
            .replace(schemePrefix, "")
            .replace(unsafeFilenameChars, "_")

    /**
     * Closes the HTTP client and releases resources.
     */
    override fun close() {
        client.close()
    }
}
|
||||||
|
|
||||||
|
/**
 * Reads the website list from [dataFile], then downloads every listed URL into [siteDirectory].
 *
 * Each data line is split on `,` and its first four fields become a [Website]. Blank lines and
 * lines that do not fit that shape are reported and skipped instead of aborting the run.
 */
suspend fun main() {
    // useLines closes the underlying reader once the block returns, even on failure.
    val websites = File(dataFile).useLines { lines ->
        lines
            .drop(1) // Skips headers
            .filter { it.isNotBlank() }
            .mapNotNull { line ->
                val fields = line.split(",")
                val id = fields.firstOrNull()?.toIntOrNull()
                if (id == null || fields.size < 4) {
                    // Lines can be very long, so only echo the beginning.
                    println("Skipping malformed line: ${line.take(80)}")
                    null
                } else {
                    Website(
                        id,
                        fields[1],
                        fields[2],
                        fields[3]
                    )
                }
            }
            .toList()
    }

    val urls = websites.map { it.url }
    val scraper = WebScraper(
        outputDirectory = siteDirectory,
        concurrencyLevel = 20,
        requestTimeoutMillis = 30000
    )

    try {
        scraper.scrapeWebpages(urls)
    } finally {
        scraper.close()
    }
}
|
||||||
@@ -9,4 +9,5 @@ plugins {
|
|||||||
rootProject.name = "OpenGraphKt"
|
rootProject.name = "OpenGraphKt"
|
||||||
include("opengraphkt")
|
include("opengraphkt")
|
||||||
include("demo")
|
include("demo")
|
||||||
include("demo-remote")
|
include("demo-remote")
|
||||||
|
include("scrape-test")
|
||||||
Reference in New Issue
Block a user