通过topic的编号下载单篇文章、附件、缩略图到本地。
; Storage layout and settings used by the download functions below.
Global Const $gc_StoreFolder = @ScriptDir & "\Autoitscript\";root storage folder
Global Const $gc_HtmlFileFolder = $gc_StoreFolder & "html\";folder for the saved HTML pages
Global Const $gc_TopicList_File = $gc_StoreFolder & "TopicList.ini";topic list file
Global Const $gc_TopicList_Section = "TopicList"
Global Const $gc_AttachFileExt = "zip|rar|au3";supported attachment extensions, separated by |
Global Const $gc_AttachFile_FolderName = "Attach"
Global Const $gc_AttachFile_Folder = $gc_StoreFolder & $gc_AttachFile_FolderName & "\"
_CheckCssStyle();initialisation: download the css file (background images referenced inside the css are not downloaded)
_DownLoad_TopicBySN(77503);download the topic numbered 77503
; Downloads the print view of one forum topic, localises its images and
; attachments, and stores the result as <html folder>\<topic number>.html.
;
; If the HTML file already exists and a title is recorded for the topic, the
; stored file is re-read instead of being downloaded again; images and
; attachments are still (re)localised and the file is rewritten.
;
; Parameters:
;   $s_SN - topic number (the "t=" value of the forum URL).
; Returns:
;   Nothing meaningful. On failure @error is set:
;     1 - no topic title was found in the downloaded page; nothing is recorded or saved.
;     2 - the output file could not be opened for writing.
Func _DownLoad_TopicBySN($s_SN)
    Local $s_Return, $s_Title, $s_File
    Local $s_HtmlPath = $gc_HtmlFileFolder & $s_SN & ".html"

    If Not (FileExists($s_HtmlPath) And StringLen(_ReadTopicTitleInfoBySN($s_SN))) Then
        $s_Return = _LocalCssStyle(_DeleteJsScript(_INetGetSource("http://www.autoitscript.com/forum/index.php?act=Print&client=html&f=9&t=" & $s_SN)))
        $s_Title = StringRegExp($s_Return, "<h3>.*?_.*?_\s*(.*?)\s*</h3>", 3)
        ; Without a title the page is not a usable topic page: do not record or save it.
        If UBound($s_Title) = 0 Then Return SetError(1, 0, 0)
        ; StringRegExp flag 3 returns an array; the captured title is its first element.
        _RecordTopicInfo($s_Title[0], $s_SN)
    Else
        $s_Return = FileRead($s_HtmlPath)
    EndIf

    $s_Return = _LocalHtmlImages($s_Return)
    $s_Return = _LocalHtmlAttachFile($s_Return)

    ; 2 = overwrite, 8 = create the directory path if it does not exist.
    $s_File = FileOpen($s_HtmlPath, 2 + 8)
    If $s_File = -1 Then Return SetError(2, 0, 0)
    FileWrite($s_File, $s_Return)
    FileClose($s_File)
EndFunc
; Fetches a URL with _INetGet_Http and returns the body as text.
;
; Parameters:
;   $sInetUrl    - URL to fetch.
;   $sCodeFormat - flag passed to BinaryToString when the result is binary (default 1).
; Returns:
;   The decoded string when _INetGet_Http returned binary data; otherwise the
;   value returned by _INetGet_Http, unchanged.
Func _INetGetSource($sInetUrl ,$sCodeFormat = 1)
    Local $vBody = _INetGet_Http($sInetUrl)
    ; Non-binary results (the failure values of _INetGet_Http) are passed through untouched.
    If Not IsBinary($vBody) Then Return $vBody
    Return BinaryToString($vBody, $sCodeFormat)
EndFunc
; Fetches a URL over HTTP (GET) through wininet.dll and returns the raw response body.
;
; Parameters:
;   $sInetUrl - URL of the form http://<host>/<path>.
; Returns:
;   Success - the response body as a binary variant.
;   Failure - sets @error and returns 0 (@error 1) or '' (all other codes):
;     1 - wininet.dll could not be opened
;     2 - InternetOpenW failed
;     3 - InternetConnectW failed
;     4 - HttpOpenRequestW failed
;     5 - HttpSendRequestW failed
;     6 - the URL could not be split into host and path
; Remarks:
;   All WinINet handles and the DLL handle are released on every exit path.
;   If a read fails part-way, the data received so far is returned.
Func _INetGet_Http($sInetUrl)
    ; StringRegExp flag 3 returns [0] = host name, [1] = object path.
    Local $aUrlParts = StringRegExp($sInetUrl, 'http://(.*?)/(.*)', 3)
    If @error Or UBound($aUrlParts) < 2 Then Return SetError(6, 0, '')
    Local $sHostName = $aUrlParts[0], $sFileName = $aUrlParts[1]

    Local $hDll = DllOpen("wininet.dll")
    If $hDll = -1 Then Return SetError(1, 0, 0)

    Local $hSession = 0, $hConnect = 0, $hRequest = 0

    ; DllCall returns an array; element 0 holds the function's return value.
    Local $aCall = DllCall($hDll, "ptr", "InternetOpenW", "wstr", "", "dword", 1, "ptr", 0, "ptr", 0, "dword", 0x04000000)
    If @error Or Not $aCall[0] Then
        __INetGet_Http_Cleanup($hDll, $hRequest, $hConnect, $hSession)
        Return SetError(2, 0, '')
    EndIf
    $hSession = $aCall[0]

    $aCall = DllCall($hDll, "ptr", "InternetConnectW", "ptr", $hSession, "wstr", $sHostName, "dword", 0, "ptr", 0, "ptr", 0, "dword", 3, "dword", 0, "ptr", 0)
    If @error Or Not $aCall[0] Then
        __INetGet_Http_Cleanup($hDll, $hRequest, $hConnect, $hSession)
        Return SetError(3, 0, '')
    EndIf
    $hConnect = $aCall[0]

    $aCall = DllCall($hDll, "ptr", "HttpOpenRequestW", "ptr", $hConnect, "wstr", "GET", "wstr", $sFileName, "wstr", "HTTP/1.1", "ptr", 0, "ptr", 0, "dword", 0, "ptr", 0)
    If @error Or Not $aCall[0] Then
        __INetGet_Http_Cleanup($hDll, $hRequest, $hConnect, $hSession)
        Return SetError(4, 0, '')
    EndIf
    $hRequest = $aCall[0]

    $aCall = DllCall($hDll, "int", "HttpSendRequestW", "ptr", $hRequest, "ptr", 0, "dword", 0, "ptr", 0, "dword", 0)
    If @error Or Not $aCall[0] Then
        __INetGet_Http_Cleanup($hDll, $hRequest, $hConnect, $hSession)
        Return SetError(5, 0, '')
    EndIf

    Local $iNumberOfBytesToRead = 4096
    Local $tNumberOfBytesRead = DllStructCreate("dword")
    Local $tBuffer = DllStructCreate("byte[" & $iNumberOfBytesToRead & "]")
    Local $iRead, $sHex = ''

    While 1
        $aCall = DllCall($hDll, "int", "InternetReadFile", "ptr", $hRequest, "ptr", DllStructGetPtr($tBuffer), "dword", $iNumberOfBytesToRead, "ptr", DllStructGetPtr($tNumberOfBytesRead))
        ; Stop on a failed call: the byte counter may be stale and would otherwise
        ; keep the loop running forever.
        If @error Or Not $aCall[0] Then ExitLoop
        $iRead = DllStructGetData($tNumberOfBytesRead, 1)
        If $iRead <= 0 Then ExitLoop ; end of the response body
        ; Collect the chunk as hex digits (the "0x" prefix of the string form is dropped).
        $sHex &= StringTrimLeft(String(BinaryMid(DllStructGetData($tBuffer, 1), 1, $iRead)), 2)
    WEnd

    __INetGet_Http_Cleanup($hDll, $hRequest, $hConnect, $hSession)
    Return Binary('0x' & $sHex)
EndFunc

; Internal helper: closes the given WinINet handles (zero entries are skipped)
; and then the DLL handle itself.
Func __INetGet_Http_Cleanup($hDll, $hRequest, $hConnect, $hSession)
    If $hRequest Then DllCall($hDll, "int", "InternetCloseHandle", "ptr", $hRequest)
    If $hConnect Then DllCall($hDll, "int", "InternetCloseHandle", "ptr", $hConnect)
    If $hSession Then DllCall($hDll, "int", "InternetCloseHandle", "ptr", $hSession)
    DllClose($hDll)
EndFunc
; Removes every <script ...>...</script> element (case-insensitive) from the HTML.
;
; Parameters:
;   $s_HtmlSource - HTML text.
; Returns:
;   The HTML text with all script elements removed.
Func _DeleteJsScript($s_HtmlSource)
    Local Const $s_ScriptPattern = "(?i)(<script[^\xff]*?</script>)"
    Local $s_Cleaned = StringRegExpReplace($s_HtmlSource, $s_ScriptPattern, "")
    Return $s_Cleaned
EndFunc
; Replaces every inline <style ...>...</style> element (case-insensitive) with a
; <link> to the shared local stylesheet "../css.css".
;
; Parameters:
;   $s_HtmlSource - HTML text.
; Returns:
;   The HTML text with each style element replaced by the stylesheet link.
Func _LocalCssStyle($s_HtmlSource)
    Local Const $s_StylePattern = "(?i)(<style[^\xff]*?</style>)"
    Local Const $s_LinkTag = '<link rel="stylesheet" type="text/css" href="../css.css" />'
    Local $s_Result = StringRegExpReplace($s_HtmlSource, $s_StylePattern, $s_LinkTag)
    Return $s_Result
EndFunc
; Turns a link taken from HTML source into a requestable URL by decoding the
; "&amp;" entity back to a plain "&".
;
; Parameters:
;   $s_Link - href value as it appears in the HTML source.
; Returns:
;   The link with every "&amp;" replaced by "&"; other text is left unchanged.
Func _ReplaceForLink($s_Link)
    ; The listing replaced "&" with "&" (a no-op); the entity form is what
    ; actually occurs in href attributes and must be decoded before the request.
    Return StringReplace($s_Link, "&amp;", "&")
EndFunc
; Downloads every forum-hosted image referenced by an <img src="..."> tag and
; rewrites the tag to point at the local copy.
;
; The image is stored under $gc_StoreFolder using the path it has below
; http://www.autoitscript.com/forum/, and the tag is rewritten to "../<that path>".
; Images that cannot be downloaded or saved keep their original remote URL.
;
; Parameters:
;   $s_HtmlSource - HTML text of the page.
; Returns:
;   The HTML text with the localised image references.
Func _LocalHtmlImages($s_HtmlSource)
    ; Flag 4 returns an array of arrays; per match:
    ;   [0] = whole matched text, [1] = absolute image URL, [2] = path below /forum/.
    Local $a_Matches = StringRegExp($s_HtmlSource, '(?i)<img src="(http://www\.autoitscript\.com/forum/([^"]*))"', 4)
    Local $a_Match, $s_Path, $v_Data, $h_File, $s_LocalTag

    For $s_I = 0 To UBound($a_Matches) - 1
        $a_Match = $a_Matches[$s_I]
        If UBound($a_Match) < 3 Then ContinueLoop

        $s_Path = StringReplace($gc_StoreFolder & $a_Match[2], "/", "\")
        If Not FileExists($s_Path) Then
            $v_Data = _INetGet_Http($a_Match[1])
            ; Download failed or returned nothing: leave the remote reference in place.
            If @error Or BinaryLen($v_Data) = 0 Then ContinueLoop
            ; 2 = overwrite, 8 = create directories, 16 = binary mode.
            $h_File = FileOpen($s_Path, 2 + 8 + 16)
            If $h_File = -1 Then ContinueLoop
            FileWrite($h_File, $v_Data)
            FileClose($h_File)
        EndIf

        $s_LocalTag = '<img src="../' & $a_Match[2] & '"'
        $s_HtmlSource = StringReplace($s_HtmlSource, $a_Match[0], $s_LocalTag)
        ConsoleWrite($a_Match[0] & @CRLF)
        ConsoleWrite($s_LocalTag & @CRLF)
    Next
    Return $s_HtmlSource
EndFunc
; Downloads every forum-hosted attachment whose link text ends in one of the
; extensions listed in $gc_AttachFileExt and rewrites the link to the local copy.
;
; The attachment is stored in $gc_AttachFile_Folder under its link text, and the
; href is rewritten to "../<$gc_AttachFile_FolderName>/<link text>".
; Attachments that cannot be downloaded or saved keep their original remote URL.
;
; Parameters:
;   $s_HtmlSource - HTML text of the page.
; Returns:
;   The HTML text with the localised attachment links.
Func _LocalHtmlAttachFile($s_HtmlSource)
    ; Flag 4 returns an array of arrays; per match:
    ;   [0] = whole <a ...>...</a> element, [1] = href URL, [2] = link text (the file name).
    Local $a_Matches = StringRegExp($s_HtmlSource, '(?i)<a[^>]*href="(http://www\.autoitscript\.com/forum/[^"]*?)"[^>]*>(.*?\.(?i:' & $gc_AttachFileExt & '))</a>', 4)
    Local $a_Match, $s_Path, $v_Data, $h_File, $s_Prefix, $s_LocalLink

    For $s_I = 0 To UBound($a_Matches) - 1
        $a_Match = $a_Matches[$s_I]
        If UBound($a_Match) < 3 Then ContinueLoop

        $s_Path = $gc_AttachFile_Folder & $a_Match[2]
        If Not FileExists($s_Path) Then
            $v_Data = _INetGet_Http(_ReplaceForLink($a_Match[1]))
            ; Download failed or returned nothing: leave the remote link in place.
            If @error Or BinaryLen($v_Data) = 0 Then ContinueLoop
            ; 2 = overwrite, 8 = create directories, 16 = binary mode.
            $h_File = FileOpen($s_Path, 2 + 8 + 16)
            If $h_File = -1 Then ContinueLoop
            FileWrite($h_File, $v_Data)
            FileClose($h_File)
        EndIf

        ; Everything of the element that precedes the URL, i.e. up to and including href=".
        $s_Prefix = StringLeft($a_Match[0], StringInStr($a_Match[0], $a_Match[1]) - 1)
        $s_LocalLink = $s_Prefix & '../' & $gc_AttachFile_FolderName & "/" & $a_Match[2]
        $s_HtmlSource = StringReplace($s_HtmlSource, $s_Prefix & $a_Match[1], $s_LocalLink)
        ConsoleWrite($s_Prefix & $a_Match[1] & @CRLF)
        ConsoleWrite($s_LocalLink & @CRLF)
    Next
    Return $s_HtmlSource
EndFunc
; Stores a topic title in the topic list ini file, keyed by the topic number.
;
; Parameters:
;   $s_Title - value written for the key.
;   $s_SN    - topic number, used as the ini key.
Func _RecordTopicInfo($s_Title, $s_SN)
    Local $s_IniFile = $gc_TopicList_File
    Local $s_IniSection = $gc_TopicList_Section
    IniWrite($s_IniFile, $s_IniSection, $s_SN, $s_Title)
EndFunc
; Looks up the recorded title of a topic in the topic list ini file.
;
; Parameters:
;   $s_SN - topic number, used as the ini key.
; Returns:
;   The stored title. When nothing is stored, returns "" and sets @error to 1.
Func _ReadTopicTitleInfoBySN($s_SN)
    Local $s_Title = IniRead($gc_TopicList_File, $gc_TopicList_Section, $s_SN, "")
    ; An empty value means the topic has not been recorded.
    If StringLen($s_Title) = 0 Then Return SetError(1, 0, $s_Title)
    Return SetError(0, 0, $s_Title)
EndFunc
; Makes sure the shared stylesheet css.css exists in the storage folder,
; downloading it from the forum when it is missing.
Func _CheckCssStyle()
    Local $s_CssPath = $gc_StoreFolder & "css.css"
    If FileExists($s_CssPath) Then Return

    Local $v_Css = _INetGet_Http("http://www.autoitscript.com/forum/style_images/css_14.css")
    ; Nothing received: leave the file absent so a later run can try again.
    If Not (BinaryLen($v_Css) > 0) Then Return

    ; 2 = overwrite, 8 = create directories, 16 = binary mode.
    Local $h_Css = FileOpen($s_CssPath, 2+8+16)
    FileWrite($h_Css, $v_Css)
    FileClose($h_Css)
EndFunc
革命尚未成功,同志还需努力
:face (12):
[ 本帖最后由 lynfr8 于 2009-4-19 01:25 编辑 ]
用这个获取列表,应该就能完成对Example Scripts 的文章及附件下载了。
#include <AutoitScriptTopicShow.au3>
; Walks the topic list pages of the forum (20 topics per page) and downloads
; every topic linked on them with _DownLoad_TopicBySN.
Global Const $gc_Addr = "http://www.autoitscript.com/forum/index.php?showforum=9&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st="
Global $gc_Max = 1, $gc_ThisPageNumber = 1, $gc_Return, $gc_Valve = 1, $gc_Number = 1, $gc_Temp
While $gc_Valve
    ; Page N starts at topic offset (N - 1) * 20.
    $gc_Return = _INetGetSource($gc_Addr & ($gc_ThisPageNumber - 1) * 20)
    If $gc_Number = 1 Then
        ; First page only: read the offset of the "last page" link to learn the page count.
        $gc_Max = StringRegExp($gc_Return, '<span class="pagelinklast"><a href=".*?(\d+)"', 3)
        If UBound($gc_Max) Then
            ; The captured value is the offset of the last page, so the number of
            ; pages is offset / 20 + 1 (the last page itself must be visited too).
            $gc_Max = Int($gc_Max[0] / 20) + 1
            $gc_Number = 0
        Else
            ; No "last page" link found: treat the forum as a single page.
            $gc_Max = 1
            $gc_Valve = 0
        EndIf
    EndIf
    If $gc_ThisPageNumber >= $gc_Max Then $gc_Valve = 0
    ; Flag 4 returns an array of arrays; per match: [1] = topic number, [2] = link text.
    $gc_Return = StringRegExp($gc_Return, '<a.*?href="http://www\.autoitscript\.com/forum/index\.php\?showtopic=(\d+)".*?>([^<>]*)</a>', 4)
    For $gc_I = 0 To UBound($gc_Return) - 1
        $gc_Temp = $gc_Return[$gc_I]
        If UBound($gc_Temp) < 3 Then ContinueLoop
        ConsoleWrite("[" & $gc_Temp[1] & "] " & $gc_Temp[2] & @CRLF)
        _DownLoad_TopicBySN($gc_Temp[1])
    Next
    $gc_ThisPageNumber += 1
WEnd
回复 文白
本帖最后由 lynfr8 于 2009-6-27 01:33 编辑今天的加分已经用光了,要不我想给你加够100000分...
---------------------------------------------------------------------------------------------------------
你的代码写得让自己感觉自己太菜了(学海无涯苦作舟啊...)
因为自己没有汇编语言基础,纯属处于爱好而学au3的
所以我的思路也比较简单:
手工从遨游的源码分析器viewpage获取文章名和链接---filereadline逐行读取数据---通过正则获取最终下载链接----InetGet下载html文件-----第一行的连接下载完继续读第二行,获取最终文件名和链接继续下载,其中文件名和最终链接都是变量,通过循环不断更新下载,直至文本所有行读取完毕
优点:可以百分百保证下载是自己所需文件,并且文件名精确、不会重复,上传到服务器即可马上开始下载
其次下载完即可导入制作chm,无需再修复命名
缺点:需要人工整理链接,麻烦(不过本人可以克服)
---------------------------------------------------------------------------------------------------------
部分关键代码:
文件列表链接.txt
{ps:试验文本,现在显示部分链接,需要提取的文件名和对应链接,这样可以保证下载的文件名与文章一一对应,方便制作chm}1 Standard UDF Library http://www.autoitscript.com/forum/index.php?showtopic=62035
2 javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=62035', 37, 15 ); javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=62035', 37, 15 );
3 1 http://www.autoitscript.com/forum/index.php?showtopic=62035&st=0&start=0
4 2 http://www.autoitscript.com/forum/index.php?showtopic=62035&st=15&start=15
5 3 http://www.autoitscript.com/forum/index.php?showtopic=62035&st=30&start=30
6 Last post by: http://www.autoitscript.com/forum/index.php?showtopic=62035&view=getlastpost
7 Welcome to AutoIt 1-2-3 http://www.autoitscript.com/forum/index.php?showtopic=21048
8 javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=21048', 845, 15 ); javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=21048', 845, 15 );
9 1 http://www.autoitscript.com/forum/index.php?showtopic=21048&st=0&start=0
10 2 http://www.autoitscript.com/forum/index.php?showtopic=21048&st=15&start=15
11 3 http://www.autoitscript.com/forum/index.php?showtopic=21048&st=30&start=30
12 ? 57 http://www.autoitscript.com/forum/index.php?showtopic=21048&st=840&start=840
13 Last post by: http://www.autoitscript.com/forum/index.php?showtopic=21048&view=getlastpost
14 FTP.au3 http://www.autoitscript.com/forum/index.php?showtopic=12473
15 javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=12473', 257, 15 ); javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=12473', 257, 15 );
16 1 http://www.autoitscript.com/forum/index.php?showtopic=12473&st=0&start=0
17 2 http://www.autoitscript.com/forum/index.php?showtopic=12473&st=15&start=15
18 3 http://www.autoitscript.com/forum/index.php?showtopic=12473&st=30&start=30
19 ? 18 http://www.autoitscript.com/forum/index.php?showtopic=12473&st=255&start=255
20 Last post by: http://www.autoitscript.com/forum/index.php?showtopic=12473&view=getlastpost
; Reads the link list file line by line and downloads the print view of each
; listed topic with InetGet.
; NOTE(review): $array holds the result of StringSplit but is used below without
; an element index (in StringCompare, StringLeft and the $link assignment); the
; subscripts appear to have been lost from this listing - confirm against the
; original script before running.
; NOTE(review): no WEnd or FileClose is visible; the listing is presented as a
; partial excerpt.
$file = FileOpen("文件列表链接.txt", 0)
If $file = -1 Then
MsgBox(0, "错误", "不能打开文件.")
Exit
EndIf
dim $count;
; read lines until the end of the file
While 1
$line = FileReadLine($file)
If @error = -1 Then ExitLoop
$array = StringSplit($line, " ");split on spaces (author's note: into three fields)
$count = $count + 1
dim $title = ""
dim $name = ""
dim $link
for $i = 3 to UBound($array) - 3
$title = $title & " " & $array[$i];build the file name
$name = $name & "_" & $array[$i]
Next
$result = StringCompare($array & " " & $array, 'Last post')
;detect "javascript" jump entries and skip them
$jump = StringLeft($array, 10);
$isJump = StringCompare($jump, "javascript")
if $isJump = 0 Then
ContinueLoop
EndIf
;detect entries whose title is alphanumeric (author's note: digits) and skip them
$isDigit = StringIsAlNum($title)
if $isDigit = 1 Then
MsgBox(0, "Title", $title)
ContinueLoop
EndIf
$link = $array;get the link
$Getlinks = StringReplace($link, "showtopic", "act=Print&client=html&f=9&t");build the print-view download link
If $result = 0 Then
ContinueLoop
Else
InetGet($Getlinks, "C:\autoitscript\" & $title & ".html", 1, 0);start the download
EndIf
有些地方感觉不是很完美(例如下载的文章都是数字为文件名的,要导入chm需要再修改?那就麻烦了,还有附件命名也是...这个到时再请教文白,相信有办法可自动完成相关任务的),总而言之,写得太经典了!!!
【此问题已解决,在制作文章索引时候写段代码利用相对路径重新定位*(数字).html即可】;将文白代码生成的TopicList.html改后缀名为txt,匹配成相对路径后写入TopicListnew.txt
; Converts TopicList.txt (lines of the form "<topic number>=<title>") into
; TopicListnew.txt, writing one relative HTML link per topic:
;   <a href='html/<topic number>.html' ><title></a></br>
; Lines without "=" (for example the ini section header) are skipped.
$file = FileOpen("TopicList.txt", 0)
If $file = -1 Then
    MsgBox(0, "错误", "不能打开文件.")
    Exit
EndIf
; Open the output once (1 = append) instead of once per input line.
$file2 = FileOpen("TopicListnew.txt", 1)
If $file2 = -1 Then
    FileClose($file)
    MsgBox(0, "错误", "不能打开文件.")
    Exit
EndIf
Dim $count = 0
While 1
    $line = FileReadLine($file)
    ; Stop at end of file (or on a read error); without this the loop never ends.
    If @error Then ExitLoop
    ; StringSplit: [0] = number of pieces, [1] = topic number, [2..] = title pieces.
    $array = StringSplit($line, "=", 1)
    If $array[0] < 2 Then ContinueLoop
    $count = $count + 1
    $link = $array[1]
    $title = $array[2]
    ; A title may itself contain "=": put the remaining pieces back together.
    For $i = 3 To $array[0]
        $title = $title & "=" & $array[$i]
    Next
    $a = '<a href' & '='
    $b = '''html/'
    $c = '.html'''
    $d = '>' & $title & '</a></br>'
    $e = $a & $b & $link & $c & ' ' & $d
    FileWriteLine($file2, $e)
    TrayTip("次数", $count, 0)
WEnd
FileClose($file2)
FileClose($file)
1.下载附件时,可能存在名称相同的文件。所以文件名要加前缀。
2.几处正则,需要再完善。
精力有限,只能做到这步。
下步的汉化,才是真正的难题。还需热心Auer、网友们一起努力。 这个。。。真是被感动了:face (9): 文白真的很强!呵呵,膜拜一下! 太强悍了,一定要顶起,:face (29): :face (29): :face (29): 更新了<>,修改了<>支持中断后,继续上次工作,提升了文章分类的准确性,但工作量增大很多.从过去的6700增加到100000.
#include <AutoitScriptTopicShow.au3>
; Resume-capable driver: downloads the topics numbered $gc_StartNum to
; $gc_EndNum, saving the current position after every topic so that an
; interrupted run continues from the last saved topic number.
$gc_ForumBlock_Name = "Example Scripts"; set the name of the forum board that may be downloaded
Global Const $gc_TopicProgram_FileName = "TopicProgram.ini"
Global Const $gc_Addr = "http://www.autoitscript.com/forum/index.php?showforum=9&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st="
; NOTE(review): the initializer of $gc_DefaultNum is missing in this listing - restore the intended default value(s).
Global Const $gc_DefaultNum =
Global $gc_StartNum = _TopicProgram_Read(0), $gc_EndNum = _TopicProgram_Read(1), $gc_ThisNum = _TopicProgram_Read(2)
For $gc_I = $gc_StartNum To $gc_EndNum
; On the first iteration jump ahead to the saved position.
If $gc_I = $gc_StartNum Then $gc_I = $gc_ThisNum
$gc_ThisNum = $gc_I
_DownLoad_TopicBySN($gc_ThisNum)
; Persist the position after every topic.
_TopicProgram_Write($gc_StartNum, $gc_EndNum, $gc_ThisNum)
ConsoleWrite($gc_I & @CRLF)
Next
; NOTE(review): _CreateHtmlList is not defined in this listing - presumably provided by the included file; confirm.
_CreateHtmlList()
; Reads one of the saved progress values from the "Program" section of the
; progress ini file.
;
; Parameters:
;   $s_Flags - 0 reads "StartNum", 1 reads "EndNum", 2 reads "ThisNum".
; Returns:
;   The stored value, or $gc_DefaultNum when the key is missing (or holds -1).
;   For any other flag an empty string is returned.
Func _TopicProgram_Read($s_Flags)
    Local $s_Key
    Switch $s_Flags
        Case 0
            $s_Key = "StartNum"
        Case 1
            $s_Key = "EndNum"
        Case 2
            $s_Key = "ThisNum"
        Case Else
            Return ""
    EndSwitch

    Local $s_Value = IniRead($gc_StoreFolder & $gc_TopicProgram_FileName, "Program", $s_Key, -1)
    ; -1 is the "not found" marker passed to IniRead above.
    If $s_Value = -1 Then Return $gc_DefaultNum
    Return $s_Value
EndFunc
; Saves the three progress values to the "Program" section of the progress ini file.
;
; Parameters:
;   $s_StartNum - value written under "StartNum".
;   $s_EndNum   - value written under "EndNum".
;   $s_ThisNum  - value written under "ThisNum".
Func _TopicProgram_Write($s_StartNum, $s_EndNum, $s_ThisNum)
    Local $s_IniPath = $gc_StoreFolder & $gc_TopicProgram_FileName
    Local $a_Keys[3] = ["StartNum", "EndNum", "ThisNum"]
    Local $a_Values[3] = [$s_StartNum, $s_EndNum, $s_ThisNum]
    ; Written in the fixed order StartNum, EndNum, ThisNum.
    For $i_Idx = 0 To 2
        IniWrite($s_IniPath, "Program", $a_Keys[$i_Idx], $a_Values[$i_Idx])
    Next
EndFunc
#include <AutoitScriptTopicShow.au3>
; Resume-capable driver: downloads the topics numbered $gc_StartNum to
; $gc_EndNum, saving the current position after every topic so that an
; interrupted run continues from the last saved topic number.
$gc_ForumBlock_Name = "Example Scripts"; set the name of the forum board that may be downloaded
Global Const $gc_TopicProgram_FileName = "TopicProgram.ini"
Global Const $gc_Addr = "http://www.autoitscript.com/forum/index.php?showforum=9&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st="
; NOTE(review): the initializer of $gc_DefaultNum is missing in this listing - restore the intended default value(s).
Global Const $gc_DefaultNum =
Global $gc_StartNum = _TopicProgram_Read(0), $gc_EndNum = _TopicProgram_Read(1), $gc_ThisNum = _TopicProgram_Read(2)
For $gc_I = $gc_StartNum To $gc_EndNum
; On the first iteration jump ahead to the saved position.
If $gc_I = $gc_StartNum Then $gc_I = $gc_ThisNum
$gc_ThisNum = $gc_I
_DownLoad_TopicBySN($gc_ThisNum)
; Persist the position after every topic.
_TopicProgram_Write($gc_StartNum, $gc_EndNum, $gc_ThisNum)
ConsoleWrite($gc_I & @CRLF)
Next
; NOTE(review): _CreateHtmlList is not defined in this listing - presumably provided by the included file; confirm.
_CreateHtmlList()
; Reads one of the saved progress values from the "Program" section of the
; progress ini file.
;
; Parameters:
;   $s_Flags - 0 reads "StartNum", 1 reads "EndNum", 2 reads "ThisNum".
; Returns:
;   The stored value, or $gc_DefaultNum when the key is missing (or holds -1).
;   For any other flag an empty string is returned.
Func _TopicProgram_Read($s_Flags)
    Local $s_Key
    Switch $s_Flags
        Case 0
            $s_Key = "StartNum"
        Case 1
            $s_Key = "EndNum"
        Case 2
            $s_Key = "ThisNum"
        Case Else
            Return ""
    EndSwitch

    Local $s_Value = IniRead($gc_StoreFolder & $gc_TopicProgram_FileName, "Program", $s_Key, -1)
    ; -1 is the "not found" marker passed to IniRead above.
    If $s_Value = -1 Then Return $gc_DefaultNum
    Return $s_Value
EndFunc
; Saves the three progress values to the "Program" section of the progress ini file.
;
; Parameters:
;   $s_StartNum - value written under "StartNum".
;   $s_EndNum   - value written under "EndNum".
;   $s_ThisNum  - value written under "ThisNum".
Func _TopicProgram_Write($s_StartNum, $s_EndNum, $s_ThisNum)
IniWrite($gc_StoreFolder & $gc_TopicProgram_FileName, "Program", "StartNum", $s_StartNum)
IniWrite($gc_StoreFolder & $gc_TopicProgram_FileName, "Program", "EndNum", $s_EndNum)
IniWrite($gc_StoreFolder & $gc_TopicProgram_FileName, "Program", "ThisNum", $s_ThisNum)
EndFunc 文白真是太强了,看来这个工程顺利拿下来真是小case了
开心啊!!!! 推荐楼主用网文快捕下载,它能在线保存html网页,并且自己可以随意修改目录层次,可制作成chm格式电子书。
可能需要注册,楼主可以用4.36版的,网上有网友发布的可用注册号,如果找不到的话可联系我,我可将软件和序列号都提供给你。
Q:171219414 不知LZ是收集所有回帖还是只收集第一页(只有指向部分回帖页的链接),因为有些回帖也挺有价值的,是否考虑过对回帖数较多的帖子作特别处理。。
[ 本帖最后由 foosea 于 2009-4-20 12:14 编辑 ] 顶下
想参加,为论坛做点事
不知道要该做些什么
我的QQ:61577962 5.1快到,期待着能如期完成