[Kotlin]爬取并下载itbook.top的电子书
import com.alibaba.fastjson.JSON
import org.jsoup.Connection
import org.jsoup.Jsoup
import java.util.HashMap
import java.io.File
import java.io.FileOutputStream

// These stay `var` (not `const val`) so that code elsewhere which reassigns them keeps compiling.
var LOGIN_URL = "http://itbook.top/login"
var USER_AGENT = "User-Agent"
var USER_AGENT_VALUE = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"

/** Program entry point; replace the two placeholder strings with real credentials before running. */
fun main(args: Array<String>) {
    simulateLogin("用户名", "密码")
}

/**
 * Logs in to itbook.top, then walks the listing pages `?page=1` .. `?page=lastPage` and downloads
 * every linked book that does not yet exist in [downloadDir].
 *
 * A failure while resolving or downloading a single book is reported on stdout and the crawl moves
 * on to the next book. Failures while logging in or while fetching a listing page still propagate.
 *
 * @param userName value submitted as the `email` field of the login form
 * @param pwd value submitted as the `password` field of the login form
 * @param downloadDir directory the PDFs are written to; created on demand
 * @param lastPage number of the last listing page to visit (the first one is always 1)
 * @throws Exception if the login page has no `form-horizontal` element, or a login / listing-page
 * request fails
 */
@JvmOverloads
@Throws(Exception::class)
fun simulateLogin(
    userName: String,
    pwd: String,
    downloadDir: String = "/media/hsdllcw/sda6/hsdllcw/Documents/Book/IT/itbook/English",
    lastPage: Int = 368
) {
    // Extra headers sent with the "get_download_url" POST.
    val ajaxHeaders = HashMap<String, String>()
    ajaxHeaders["Host"] = "itbook.top"
    ajaxHeaders["Origin"] = "http://itbook.top"
    ajaxHeaders["Accept"] = "*/*"
    ajaxHeaders["Accept-Encoding"] = "gzip, deflate"
    ajaxHeaders["Accept-Language"] = "zh-TW,zh-CN;q=0.9,zh;q=0.8,en;q=0.7"
    ajaxHeaders["Accept-Charset"] = " GB2312,utf-8;q=0.7,*;q=0.7"
    ajaxHeaders["Content-Type"] = "application/x-www-form-urlencoded; charset=UTF-8"
    ajaxHeaders["Connection"] = "keep-alive"
    ajaxHeaders["X-Requested-With"] = "XMLHttpRequest"

    // Request 1: fetch the login page to obtain its cookies and the form's fields.
    val loginPage = Jsoup.connect(LOGIN_URL)
        .header(USER_AGENT, USER_AGENT_VALUE)
        .execute()
    val form = Jsoup.parse(loginPage.body()).getElementsByClass("form-horizontal").firstOrNull()
        ?: throw IllegalStateException("login form (class \"form-horizontal\") not found at $LOGIN_URL")

    // Copy every named form field, substituting the credentials for "email" and "password".
    val formData = HashMap<String, String>()
    for (field in form.allElements) {
        val name = field.attr("name")
        if (name.isEmpty()) continue // unnamed elements are not submitted
        formData[name] = when (name) {
            "email" -> userName
            "password" -> pwd
            else -> field.attr("value")
        }
    }

    // Request 2: POST the form together with the cookies handed out by request 1.
    val login = Jsoup.connect(LOGIN_URL)
        .header(USER_AGENT, USER_AGENT_VALUE)
        .ignoreContentType(true)
        .followRedirects(true)
        .method(Connection.Method.POST)
        .data(formData)
        .cookies(loginPage.cookies())
        .execute()
    // The cookies of the login response authenticate every request below.
    val sessionCookies = login.cookies()

    val targetDir = File(downloadDir)
    for (page in 1..lastPage) {
        val pageUrl = "http://itbook.top/?page=$page"
        println("==========================================")
        println("连接$pageUrl")
        val titles = Jsoup.connect(pageUrl)
            .timeout(5000)
            .cookies(sessionCookies)
            .get()
            .getElementsByClass("lbi-name")
        for (title in titles) {
            for (link in title.getElementsByTag("a")) {
                // The book id is whatever follows the first "k/" in the link target.
                val bookId = link.attr("href").substringAfter("k/")
                // "/" would be read as a path separator, so it is replaced in the file name.
                val fileName = (link.html() + ".pdf").replace("/", "&")
                val pdf = File(targetDir, fileName)
                if (pdf.exists()) {
                    println("${fileName}\t已存在,跳过")
                } else {
                    downloadBook(bookId, fileName, pdf, ajaxHeaders, sessionCookies)
                }
                println("========================================")
            }
        }
    }
}

/**
 * Resolves the download URL of book [bookId] and stores the file at [target].
 * Progress and failures are printed to stdout; nothing is thrown for a failed book.
 */
private fun downloadBook(
    bookId: String,
    fileName: String,
    target: File,
    ajaxHeaders: Map<String, String>,
    cookies: Map<String, String>
) {
    println("正在获取${fileName}的下载地址")
    val url = try {
        resolveDownloadUrl(bookId, ajaxHeaders, cookies)
    } catch (e: Exception) {
        println("${fileName}\t下载失败")
        println("原因:${e}")
        return
    }
    println("获取完毕,${fileName}的下载地址为${url}")
    println("开始下载${fileName}")

    // Write to a side file first and rename it afterwards: an interrupted or failed write must not
    // leave a truncated file under the final name, because existing files are skipped on later runs.
    val partial = File(target.path + ".part")
    try {
        val response = Jsoup.connect(url)
            .maxBodySize(Int.MAX_VALUE)
            .ignoreContentType(true)
            .execute()
        target.parentFile?.mkdirs()
        FileOutputStream(partial).use { it.write(response.bodyAsBytes()) }
        check(partial.renameTo(target)) { "could not rename ${partial.path} to ${target.path}" }
        println("${fileName}\t已下载")
    } catch (e: Exception) {
        partial.delete()
        println("${fileName}\t下载失败")
        println("url:${url}")
        println("原因:${e}")
    }
}

/**
 * Returns the `download_url` the site reports for book [bookId].
 *
 * The token required by the "get_download_url" endpoint is cut out of the book's download page,
 * from between the markers `_token:"` and `",book_id`.
 *
 * @throws IllegalStateException if the token markers are not found, or the JSON reply has no
 * non-blank string `download_url`
 */
private fun resolveDownloadUrl(
    bookId: String,
    ajaxHeaders: Map<String, String>,
    cookies: Map<String, String>
): String {
    val downloadPage = Jsoup.connect("http://itbook.top/book/download")
        .timeout(5000)
        .ignoreContentType(true)
        .followRedirects(true)
        .cookies(cookies)
        .data(mapOf("book" to bookId))
        .get()
        .toString()
    // An empty fallback (instead of the default "whole input") keeps a page without the markers
    // from being posted as the token.
    val token = downloadPage.substringAfter("_token:\"", "").substringBefore("\",book_id", "")
    check(token.isNotEmpty()) { "no _token found on the download page of book $bookId" }

    val request = Jsoup.connect("http://itbook.top/book/get_download_url")
        .header(USER_AGENT, USER_AGENT_VALUE)
    for ((name, value) in ajaxHeaders) {
        request.header(name, value)
    }
    val reply = request
        .header("Referer", "http://itbook.top/book/download?book=$bookId")
        .method(Connection.Method.POST)
        .cookies(cookies)
        .data(mapOf("book_id" to bookId, "_token" to token))
        .execute()
        .body()

    val url = (JSON.parse(reply) as? Map<*, *>)?.get("download_url") as? String
    return url?.takeIf { it.isNotBlank() }
        ?: throw IllegalStateException("reply for book $bookId carries no download_url: $reply")
}
Tips:我觉得我一句话不说也不好,中国有句古话,叫“闷声大发财”。
那么,如何去掉
说实话,这个东西,我想你们应该知道怎么去掉吧?如果一定要问我,你可以下载一个chrome浏览器,然后进入开发者工具,选择这个元素,再把它删了就行了。【微笑(迫真)】