superflq 发表于 2009-4-18 16:56:52

支持楼主的热情!!!

文白 发表于 2009-4-18 18:53:32


通过 topic 的编号,将单篇文章及其附件、缩略图下载到本地。
; Storage layout and settings shared by every function in this script.
Global Const $gc_StoreFolder = @ScriptDir & "\Autoitscript\";Main (root) directory

Global Const $gc_HtmlFileFolder = $gc_StoreFolder & "html\";Directory for the saved HTML pages

Global Const $gc_TopicList_File = $gc_StoreFolder & "TopicList.ini";Topic list (INI file)
Global Const $gc_TopicList_Section = "TopicList"

Global Const $gc_AttachFileExt = "zip|rar|au3";Supported attachment formats, separated by |
Global Const $gc_AttachFile_FolderName = "Attach"
Global Const $gc_AttachFile_Folder = $gc_StoreFolder & $gc_AttachFile_FolderName & "\"

_CheckCssStyle();Initialisation: download the CSS file; background images referenced inside the CSS are not downloaded

_DownLoad_TopicBySN(77503);Download the topic whose number is 77503

; Downloads the printable view of one forum topic and stores it locally as
; $gc_HtmlFileFolder & <SN>.html, with inline scripts removed, the style block
; replaced by a link to the local css.css, and images/attachments localised.
;
; If the HTML file already exists AND a title for the topic is recorded in the
; topic list, the stored file is re-processed instead of being fetched again.
;
; Parameters: $s_SN - topic number (the "t=" value of the forum URL).
; Returns:    1 on success.
;             0 with @error = 1 if the page could not be fetched or no title
;               could be extracted from it (nothing is written in that case).
;             0 with @error = 2 if the output file could not be opened.
Func _DownLoad_TopicBySN($s_SN)
        Local $s_Path = $gc_HtmlFileFolder & $s_SN & ".html"
        Local $s_Return, $s_Title, $s_File

        If FileExists($s_Path) And StringLen(_ReadTopicTitleInfoBySN($s_SN)) Then
                $s_Return = FileRead($s_Path)
        Else
                $s_Return = _LocalCssStyle(_DeleteJsScript(_INetGetSource("http://www.autoitscript.com/forum/index.php?act=Print&client=html&f=9&t=" & $s_SN)))
                ; Flag 3 yields a flat array of captures; element 0 is the title text.
                $s_Title = StringRegExp($s_Return, "<h3>.*?_.*?_\s*(.*?)\s*</h3>", 3)
                ; No title means the fetch failed or the page is not a topic: do not store it.
                If UBound($s_Title) = 0 Then Return SetError(1, 0, 0)
                _RecordTopicInfo($s_Title[0], $s_SN)
        EndIf

        $s_Return = _LocalHtmlImages($s_Return)
        $s_Return = _LocalHtmlAttachFile($s_Return)

        ; Mode 2+8: overwrite, creating missing directories.
        $s_File = FileOpen($s_Path, 2 + 8)
        If $s_File = -1 Then Return SetError(2, 0, 0)
        FileWrite($s_File, $s_Return)
        FileClose($s_File)
        Return 1
EndFunc

; Fetches a URL via _INetGet_Http() and returns it as text.
; When the fetch yields binary data it is decoded with BinaryToString() using
; $sCodeFormat; any non-binary result is handed back unchanged.
Func _INetGetSource($sInetUrl ,$sCodeFormat = 1)
        Local $vFetched = _INetGet_Http($sInetUrl)
        If Not IsBinary($vFetched) Then Return $vFetched
        Return BinaryToString($vFetched, $sCodeFormat)
EndFunc

; Fetches the resource behind an "http://host/path" URL through wininet.dll
; and returns its body as a Binary value.
;
; Parameters: $sInetUrl - URL of the form http://<host>/<path>.
; Returns:    Binary data on success.
;             '' on failure, with @error set to:
;               1 - wininet.dll could not be opened
;               2 - InternetOpenW failed
;               3 - InternetConnectW failed
;               4 - HttpOpenRequestW failed
;               5 - HttpSendRequestW failed
;               6 - $sInetUrl does not match http://<host>/<path>
; All WinINet handles and the DLL handle are released on every exit path.
Func _INetGet_Http($sInetUrl)
        ; Flag 3 returns a flat array of captures: [0] = host, [1] = everything after the first "/".
        Local $aUrl = StringRegExp($sInetUrl, 'http://(.*?)/(.*)', 3)
        If @error Or UBound($aUrl) < 2 Then Return SetError(6, 0, '')
        Local $sHostName = $aUrl[0], $sFileName = $aUrl[1]

        Local $WinINet_hDLL = DllOpen("wininet.dll")
        If $WinINet_hDLL = -1 Then Return SetError(1, 0, '')

        Local $hSession = 0, $hConnect = 0, $hRequest = 0
        Local $aCall, $iError = 0, $nExtended = 0, $sReturn = ''
        Local $iNumberOfBytesToRead = 128
        Local $tNumberOfBytesRead = DllStructCreate("dword")
        Local $tBuffer = DllStructCreate("byte[" & $iNumberOfBytesToRead & "]")

        ; Single-pass Do loop: ExitLoop jumps to the shared clean-up code below.
        Do
                ; DllCall returns an array; element 0 holds the function's return value.
                $aCall = DllCall($WinINet_hDLL,"ptr","InternetOpenW","wstr","","dword",1,"ptr",0,"ptr",0,"dword",0x04000000)
                If @error Or Not $aCall[0] Then
                        $iError = 2
                        ExitLoop
                EndIf
                $hSession = $aCall[0]

                $aCall = DllCall($WinINet_hDLL,"ptr","InternetConnectW","ptr",$hSession,"wstr",$sHostName,"dword",0,"ptr",0,"ptr",0,"dword",3,"dword",0,"ptr",0)
                If @error Or Not $aCall[0] Then
                        $iError = 3
                        ExitLoop
                EndIf
                $hConnect = $aCall[0]

                $aCall = DllCall($WinINet_hDLL,"ptr","HttpOpenRequestW","ptr",$hConnect,"wstr",'GET',"wstr",$sFileName,"wstr","HTTP/1.1","ptr",0,"ptr",0,"dword",0,"ptr",0)
                If @error Or Not $aCall[0] Then
                        $iError = 4
                        ExitLoop
                EndIf
                $hRequest = $aCall[0]

                $aCall = DllCall($WinINet_hDLL,"int","HttpSendRequestW","ptr",$hRequest,"ptr",0,"dword",0,"ptr",0,"dword",0)
                If @error Or Not $aCall[0] Then
                        $iError = 5
                        ExitLoop
                EndIf

                ; Read the body chunk by chunk. Stop on a failed call as well as on
                ; a zero-byte read, so a failing read can never re-append a stale buffer.
                While 1
                        $aCall = DllCall($WinINet_hDLL,"int","InternetReadFile","ptr",$hRequest,"ptr",DllStructGetPtr($tBuffer),"dword",$iNumberOfBytesToRead,"ptr",DllStructGetPtr($tNumberOfBytesRead))
                        If @error Or Not $aCall[0] Then ExitLoop
                        $nExtended = DllStructGetData($tNumberOfBytesRead, 1)
                        If $nExtended <= 0 Then ExitLoop
                        ; Appending a binary to a string appends its "0x..." hex text.
                        $sReturn &= BinaryMid(DllStructGetData($tBuffer, 1), 1, $nExtended)
                WEnd
        Until 1

        If $hRequest Then DllCall($WinINet_hDLL,"int","InternetCloseHandle","ptr",$hRequest)
        If $hConnect Then DllCall($WinINet_hDLL,"int","InternetCloseHandle","ptr",$hConnect)
        If $hSession Then DllCall($WinINet_hDLL,"int","InternetCloseHandle","ptr",$hSession)
        DllClose($WinINet_hDLL)

        If $iError Then Return SetError($iError, 0, '')

        ; Drop the per-chunk "0x" prefixes and rebuild one hex literal for Binary().
        $sReturn = '0x' & StringRegExpReplace($sReturn, '0x', '')
        Return Binary($sReturn)
EndFunc

; Returns $s_HtmlSource with every <script ...>...</script> element removed.
; Matching is case-insensitive and spans line breaks.
Func _DeleteJsScript($s_HtmlSource)
        Local Const $s_ScriptPattern = "(?i)(<script[^\xff]*?</script>)"
        Local $s_Stripped = StringRegExpReplace($s_HtmlSource, $s_ScriptPattern, "")
        Return $s_Stripped
EndFunc

; Returns $s_HtmlSource with every <style ...>...</style> element replaced by
; a <link> tag that points at the local stylesheet ../css.css.
Func _LocalCssStyle($s_HtmlSource)
        Local Const $s_StylePattern = "(?i)(<style[^\xff]*?</style>)"
        Local Const $s_LocalLink = '<link rel="stylesheet" type="text/css" href="../css.css" />'
        Return StringRegExpReplace($s_HtmlSource, $s_StylePattern, $s_LocalLink)
EndFunc

; Converts the HTML entity "&amp;" in a link back to a plain "&".
Func _ReplaceForLink($s_Link)
        ; Occurrence 0 = replace all; casesense 1 keeps the match case-sensitive.
        Local $s_Decoded = StringReplace($s_Link, "&amp;", "&", 0, 1)
        Return $s_Decoded
EndFunc

; Downloads every forum-hosted image referenced by an <img src="..."> tag in
; $s_HtmlSource into $gc_StoreFolder (mirroring the path below /forum/) and
; rewrites the tag to point at the local copy via "../<relative path>".
; Images that cannot be downloaded or saved keep their original remote URL.
;
; Parameters: $s_HtmlSource - HTML text of a topic page.
; Returns:    the HTML text with localised image references.
Func _LocalHtmlImages($s_HtmlSource)
        ; Flag 4 returns an array of arrays; for each match:
        ;   [0] = the whole '<img src="...URL..."' text, [1] = full URL, [2] = path below /forum/.
        Local $s_Return = StringRegExp($s_HtmlSource, '(?i)<img src="(http://www\.autoitscript\.com/forum/([^"]*))"', 4)
        Local $s_Match, $s_Data, $s_File, $s_Path, $s_LocalTag
        For $s_I = 0 To UBound($s_Return) - 1
                $s_Match = $s_Return[$s_I]
                If UBound($s_Match) < 3 Then ContinueLoop
                $s_Path = StringReplace($gc_StoreFolder & $s_Match[2], "/", "\")
                If Not FileExists($s_Path) Then
                        $s_Data = _INetGet_Http($s_Match[1])
                        ; Only a non-empty binary is a successful download.
                        If Not IsBinary($s_Data) Or BinaryLen($s_Data) = 0 Then ContinueLoop
                        ; Mode 2+8+16: overwrite, create directories, binary.
                        $s_File = FileOpen($s_Path, 2 + 8 + 16)
                        If $s_File = -1 Then ContinueLoop
                        FileWrite($s_File, $s_Data)
                        FileClose($s_File)
                EndIf
                $s_LocalTag = '<img src="../' & $s_Match[2] & '"'
                $s_HtmlSource = StringReplace($s_HtmlSource, $s_Match[0], $s_LocalTag)
                ConsoleWrite($s_Match[0] & @CRLF)
                ConsoleWrite($s_LocalTag & @CRLF)
        Next
        Return $s_HtmlSource
EndFunc

; Downloads every forum-hosted attachment linked from $s_HtmlSource whose link
; text ends in one of the extensions listed in $gc_AttachFileExt, stores it in
; $gc_AttachFile_Folder under the link text as file name, and rewrites the
; link target to "../<$gc_AttachFile_FolderName>/<file name>".
; Attachments that cannot be downloaded or saved keep their remote link.
;
; Parameters: $s_HtmlSource - HTML text of a topic page.
; Returns:    the HTML text with localised attachment links.
Func _LocalHtmlAttachFile($s_HtmlSource)
        ; Flag 4 returns an array of arrays; for each match:
        ;   [0] = the whole <a ...>name.ext</a> element, [1] = href URL, [2] = link text (file name).
        Local $s_Return = StringRegExp($s_HtmlSource, '(?i)<a[^>]*href="(http://www\.autoitscript\.com/forum/[^"]*?)"[^>]*>(.*?\.(?i:' & $gc_AttachFileExt & '))</a>', 4)
        Local $s_Match, $s_Data, $s_File, $s_Path, $s_Prefix, $s_LocalLink
        For $s_I = 0 To UBound($s_Return) - 1
                $s_Match = $s_Return[$s_I]
                If UBound($s_Match) < 3 Then ContinueLoop
                $s_Path = $gc_AttachFile_Folder & $s_Match[2]
                If Not FileExists($s_Path) Then
                        ; The href is HTML-escaped in the page; undo "&amp;" before requesting it.
                        $s_Data = _INetGet_Http(_ReplaceForLink($s_Match[1]))
                        ; Only a non-empty binary is a successful download.
                        If Not IsBinary($s_Data) Or BinaryLen($s_Data) = 0 Then ContinueLoop
                        ; Mode 2+8+16: overwrite, create directories, binary.
                        $s_File = FileOpen($s_Path, 2 + 8 + 16)
                        If $s_File = -1 Then ContinueLoop
                        FileWrite($s_File, $s_Data)
                        FileClose($s_File)
                EndIf
                ; Text of the tag that precedes the URL (e.g. '<a href="'); replacing
                ; prefix+URL instead of the bare URL limits the change to this link.
                $s_Prefix = StringLeft($s_Match[0], StringInStr($s_Match[0], $s_Match[1]) - 1)
                $s_LocalLink = $s_Prefix & '../' & $gc_AttachFile_FolderName & "/" & $s_Match[2]
                $s_HtmlSource = StringReplace($s_HtmlSource, $s_Prefix & $s_Match[1], $s_LocalLink)
                ConsoleWrite($s_Prefix & $s_Match[1] & @CRLF)
                ConsoleWrite($s_LocalLink & @CRLF)
        Next
        Return $s_HtmlSource
EndFunc

; Stores a topic title in the topic-list INI file, keyed by the topic number.
Func _RecordTopicInfo($s_Title, $s_SN)
        Local $s_IniFile = $gc_TopicList_File
        Local $s_IniSection = $gc_TopicList_Section
        IniWrite($s_IniFile, $s_IniSection, $s_SN, $s_Title)
EndFunc

; Looks up the title recorded for topic number $s_SN in the topic-list INI file.
; Returns the title with @error = 0, or "" with @error = 1 when none is stored.
Func _ReadTopicTitleInfoBySN($s_SN)
        Local $s_Title = IniRead($gc_TopicList_File, $gc_TopicList_Section, $s_SN, "")
        If StringLen($s_Title) = 0 Then Return SetError(1, 0, $s_Title)
        Return SetError(0, 0, $s_Title)
EndFunc

; Makes sure the forum stylesheet exists locally as $gc_StoreFolder & "css.css".
; If the file is missing it is downloaded; an empty download leaves nothing on disk.
Func _CheckCssStyle()
        Local $s_CssPath = $gc_StoreFolder & "css.css"
        If FileExists($s_CssPath) Then Return

        Local $s_CssData = _INetGet_Http("http://www.autoitscript.com/forum/style_images/css_14.css")
        If Not (BinaryLen($s_CssData) > 0) Then Return

        ; Mode 2+8+16: overwrite, create directories, binary.
        Local $s_Handle = FileOpen($s_CssPath, 2+8+16)
        FileWrite($s_Handle, $s_CssData)
        FileClose($s_Handle)
EndFunc

lynfr8 发表于 2009-4-18 19:11:21

革命尚未成功,同志还需努力
:face (12):

[ 本帖最后由 lynfr8 于 2009-4-19 01:25 编辑 ]

文白 发表于 2009-4-18 19:24:41


用这个获取列表,应该就能完成对Example Scripts 的文章及附件下载了。
; Walks the topic listing of forum 9 page by page (20 topics per page) and
; downloads every topic found through _DownLoad_TopicBySN().
#include <AutoitScriptTopicShow.au3>
Global Const $gc_Addr = "http://www.autoitscript.com/forum/index.php?showforum=9&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st="
Global $gc_Max = 1, $gc_ThisPageNumber = 1, $gc_Return, $gc_Valve = 1, $gc_Number = 1, $gc_Temp

While $gc_Valve
        ; Page n is requested with the offset st = (n - 1) * 20.
        $gc_Return = _INetGetSource($gc_Addr & ($gc_ThisPageNumber - 1) * 20)
        If $gc_Number = 1 Then
                ; First page only: work out how many pages there are from the "last page" link.
                $gc_Temp = StringRegExp($gc_Return, '<span class="pagelinklast"><a href=".*?(\d+)"', 3)
                If UBound($gc_Temp) Then
                        ; Assumes the captured number is the st= offset of the last page,
                        ; so the page count is offset / 20 + 1 - TODO confirm against the live markup.
                        $gc_Max = Int($gc_Temp[0] / 20) + 1
                        $gc_Number = 0
                Else
                        ; No "last page" link: treat the listing as a single page.
                        $gc_Max = 1
                        $gc_Valve = 0
                EndIf
        EndIf
        ; >= instead of = so the loop always terminates once the last page is reached.
        If $gc_ThisPageNumber >= $gc_Max Then $gc_Valve = 0
        ; Flag 4 returns an array of arrays; per match: [0] = whole link, [1] = topic number, [2] = link text.
        $gc_Return = StringRegExp($gc_Return, '<a.*?href="http://www\.autoitscript\.com/forum/index\.php\?showtopic=(\d+)".*?>([^<>]*)</a>', 4)
        For $gc_I = 0 To UBound($gc_Return) - 1
                $gc_Temp = $gc_Return[$gc_I]
                If UBound($gc_Temp) < 3 Then ContinueLoop
                ConsoleWrite("[" & $gc_Temp[1] & "] " & $gc_Temp[2] & @CRLF)
                _DownLoad_TopicBySN($gc_Temp[1])
        Next
        $gc_ThisPageNumber += 1
WEnd

lynfr8 发表于 2009-4-18 20:30:42

回复 文白

本帖最后由 lynfr8 于 2009-6-27 01:33 编辑

今天的加分已经用光了,要不我想给你加够100000分...

---------------------------------------------------------------------------------------------------------
你的代码写得让自己感觉自己太菜了(学海无涯苦作舟啊...)
因为自己没有汇编语言基础,纯属出于爱好而学au3的
所以我的思路也比较简单:
手工从傲游的源码分析器viewpage获取文章名和链接---filereadline逐行读取数据---通过正则获取最终下载链接----InetGet下载html文件-----第一行的链接下载完继续读第二行,获取最终文件名和链接继续下载,其中文件名和最终链接都是变量,通过循环不断更新下载,直至文本所有行读取完毕
优点:可以百分百保证下载是自己所需文件,并且文件名精确、不会重复,上传到服务器即可马上开始下载
          其次下载完即可导入制作chm,无需再修复命名
缺点:需要人工整理链接,麻烦(不过本人可以克服)

---------------------------------------------------------------------------------------------------------
部分关键代码:

文件列表链接.txt

{ps:试验文本,现在显示部分链接,需要提取的文件名和对应链接,这样可以保证下载的文件名与文章一一对应,方便制作chm}1 Standard UDF Library http://www.autoitscript.com/forum/index.php?showtopic=62035
2 javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=62035', 37, 15 ); javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=62035', 37, 15 );
3 1 http://www.autoitscript.com/forum/index.php?showtopic=62035&st=0&start=0
4 2 http://www.autoitscript.com/forum/index.php?showtopic=62035&st=15&start=15
5 3 http://www.autoitscript.com/forum/index.php?showtopic=62035&st=30&start=30
6 Last post by: http://www.autoitscript.com/forum/index.php?showtopic=62035&view=getlastpost
7 Welcome to AutoIt 1-2-3 http://www.autoitscript.com/forum/index.php?showtopic=21048
8 javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=21048', 845, 15 ); javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=21048', 845, 15 );
9 1 http://www.autoitscript.com/forum/index.php?showtopic=21048&st=0&start=0
10 2 http://www.autoitscript.com/forum/index.php?showtopic=21048&st=15&start=15
11 3 http://www.autoitscript.com/forum/index.php?showtopic=21048&st=30&start=30
12 ? 57 http://www.autoitscript.com/forum/index.php?showtopic=21048&st=840&start=840
13 Last post by: http://www.autoitscript.com/forum/index.php?showtopic=21048&view=getlastpost
14 FTP.au3 http://www.autoitscript.com/forum/index.php?showtopic=12473
15 javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=12473', 257, 15 ); javascript:multi_page_jump('http://www.autoitscript.com/forum/index.php?showtopic=12473', 257, 15 );
16 1 http://www.autoitscript.com/forum/index.php?showtopic=12473&st=0&start=0
17 2 http://www.autoitscript.com/forum/index.php?showtopic=12473&st=15&start=15
18 3 http://www.autoitscript.com/forum/index.php?showtopic=12473&st=30&start=30
19 ? 18 http://www.autoitscript.com/forum/index.php?showtopic=12473&st=255&start=255
20 Last post by: http://www.autoitscript.com/forum/index.php?showtopic=12473&view=getlastpost


; Reads the hand-prepared link list line by line and downloads the printable
; version of each topic with InetGet.
; NOTE(review): this is a partial snippet - the closing WEnd and the FileClose
; are not part of it. Several uses of $array below have no subscript; the
; indices appear to have been lost when the code was posted - confirm against
; the author's original before running.
$file = FileOpen("文件列表链接.txt", 0)
If $file = -1 Then
MsgBox(0, "错误", "不能打开文件.")
Exit
EndIf
dim $count;
; Read text lines until the end of the file
While 1
$line = FileReadLine($file)
If @error = -1 Then ExitLoop
$array = StringSplit($line, " ");Split the line into three segments using the space as delimiter
$count = $count + 1
dim $title = ""
dim $name = ""
dim $link
for $i = 3 to UBound($array) - 3
$title = $title & " " & $array[$i];Build the file name
$name = $name & "_" & $array[$i]
Next
; $result is 0 when the compared text equals 'Last post' (such lines are skipped further down).
$result = StringCompare($array & " " & $array, 'Last post')
;Detect page-jump (javascript) lines and skip them
$jump = StringLeft($array, 10);
$isJump = StringCompare($jump, "javascript")
if $isJump = 0 Then
ContinueLoop
EndIf
;Detect purely alphanumeric titles (page-number lines) and skip them
$isDigit = StringIsAlNum($title)
if $isDigit = 1 Then
MsgBox(0, "Title", $title)
ContinueLoop
EndIf
$link = $array;Get the link
$Getlinks = StringReplace($link, "showtopic", "act=Print&client=html&f=9&t");Build the link of the printable version to download
If $result = 0 Then
ContinueLoop
Else
InetGet($Getlinks, "C:\autoitscript\" & $title & ".html", 1, 0);Start the download
EndIf



有些地方感觉不是很完美(例如下载的文章都是数字为文件名的,要导入chm需要再修改?那就麻烦了,还有附件命名也是...这个到时再请教文白,相信有办法可自动完成相关任务的),总而言之,写得太经典了!!!
【此问题已解决,在制作文章索引时候写段代码利用相对路径重新定位*(数字).html即可】;将文白代码生成的TopicList.html改后缀名为txt,匹配成相对路径后写入TopicListnew.txt
; Converts TopicList.txt (lines of the form "<topic number>=<title>") into
; HTML links with relative paths and appends them to TopicListnew.txt:
;   <a href='html/<topic number>.html' ><title></a></br>
; Lines without a "<key>=" part (for example an INI section header) are skipped.
$file = FileOpen("TopicList.txt", 0)
If $file = -1 Then Exit 1
; Open the output once, in append mode, instead of once per input line.
$file2 = FileOpen("TopicListnew.txt", 1)
If $file2 = -1 Then
    FileClose($file)
    Exit 1
EndIf
dim $count = 0
While 1
    $line = FileReadLine($file)
    ; Stop at end of file (or on any read error); without this the loop never ends.
    If @error Then ExitLoop
    ; Split at the FIRST "=" only, so titles containing "=" stay intact.
    $pos = StringInStr($line, "=")
    If $pos < 2 Then ContinueLoop
    $link = StringLeft($line, $pos - 1)
    $title = StringMid($line, $pos + 1)
    $count = $count + 1
    $a='<a href'&'='
    $b='''html/'
    $c='.html'''
    $d='>'&$title&'</a></br>'
    $e=$a&$b&$link&$c&' '&$d
    FileWriteLine($file2, $e)
    TrayTip("次数", $count, 0)
WEnd
FileClose($file2)
FileClose($file)

文白 发表于 2009-4-18 21:59:00

1.下载附件时,可能存在名称相同的文件。所以文件名要加前缀。
2.几处正则,需要再完善。

精力有限,只能做到这步。
下步的汉化,才是真正的难题。还需热心Auer、网友们一起努力。

make2855 发表于 2009-4-18 22:06:34

这个。。。真是被感动了:face (9):

破帽遮颜 发表于 2009-4-19 02:43:19

文白真的很强!呵呵,膜拜一下!

tm7801 发表于 2009-4-19 12:18:35

太强悍了,一定要顶起,:face (29): :face (29): :face (29):

文白 发表于 2009-4-19 22:30:25

更新了<>,修改了<>支持中断后,继续上次工作,提升了文章分类的准确性,但工作量增大很多.从过去的6700增加到100000.
; Driver that downloads topics by number over the range StartNum..EndNum and
; records its progress in TopicProgram.ini after every topic.
#include <AutoitScriptTopicShow.au3>
$gc_ForumBlock_Name = "Example Scripts"; Name of the forum section that is allowed to be downloaded

Global Const $gc_TopicProgram_FileName = "TopicProgram.ini"
Global Const $gc_Addr = "http://www.autoitscript.com/forum/index.php?showforum=9&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st="
; NOTE(review): the initializer of $gc_DefaultNum is missing in this copy of the
; script; the statement is incomplete until a value is supplied - confirm the
; intended default with the author.
Global Const $gc_DefaultNum =
Global $gc_StartNum = _TopicProgram_Read(0), $gc_EndNum = _TopicProgram_Read(1), $gc_ThisNum = _TopicProgram_Read(2)

For $gc_I = $gc_StartNum To $gc_EndNum
        ; On the first iteration jump ahead to the topic number stored as ThisNum.
        If $gc_I = $gc_StartNum Then $gc_I = $gc_ThisNum
        $gc_ThisNum = $gc_I
        _DownLoad_TopicBySN($gc_ThisNum)
        ; Persist the progress after every topic.
        _TopicProgram_Write($gc_StartNum, $gc_EndNum, $gc_ThisNum)
        ConsoleWrite($gc_I & @CRLF)
Next

; Not defined in this snippet - presumably provided by the included file; verify.
_CreateHtmlList()

; Reads one progress value from the "Program" section of the progress INI file.
; $s_Flags selects the key: 0 = "StartNum", 1 = "EndNum", 2 = "ThisNum".
; Returns the stored value, or $gc_DefaultNum when the key is absent (IniRead
; falls back to -1). Any other $s_Flags value yields an unassigned variable.
Func _TopicProgram_Read($s_Flags)
        Local $s_Return, $s_Key
        Switch $s_Flags
                Case 0
                        $s_Key = "StartNum"
                Case 1
                        $s_Key = "EndNum"
                Case 2
                        $s_Key = "ThisNum"
                Case Else
                        Return $s_Return
        EndSwitch
        $s_Return = IniRead($gc_StoreFolder & $gc_TopicProgram_FileName, "Program", $s_Key, -1)
        If $s_Return = -1 Then Return $gc_DefaultNum
        Return $s_Return
EndFunc

; Writes the three progress values (StartNum, EndNum, ThisNum) to the
; "Program" section of the progress INI file.
Func _TopicProgram_Write($s_StartNum, $s_EndNum, $s_ThisNum)
        Local $s_IniPath = $gc_StoreFolder & $gc_TopicProgram_FileName
        IniWrite($s_IniPath, "Program", "StartNum", $s_StartNum)
        IniWrite($s_IniPath, "Program", "EndNum", $s_EndNum)
        IniWrite($s_IniPath, "Program", "ThisNum", $s_ThisNum)
EndFunc
; Driver that downloads topics by number over the range StartNum..EndNum and
; records its progress in TopicProgram.ini after every topic.
#include <AutoitScriptTopicShow.au3>
$gc_ForumBlock_Name = "Example Scripts"; Name of the forum section that is allowed to be downloaded

Global Const $gc_TopicProgram_FileName = "TopicProgram.ini"
Global Const $gc_Addr = "http://www.autoitscript.com/forum/index.php?showforum=9&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st="
; NOTE(review): the initializer of $gc_DefaultNum is missing in this copy of the
; script; the statement is incomplete until a value is supplied - confirm the
; intended default with the author.
Global Const $gc_DefaultNum =
Global $gc_StartNum = _TopicProgram_Read(0), $gc_EndNum = _TopicProgram_Read(1), $gc_ThisNum = _TopicProgram_Read(2)

For $gc_I = $gc_StartNum To $gc_EndNum
        ; On the first iteration jump ahead to the topic number stored as ThisNum.
        If $gc_I = $gc_StartNum Then $gc_I = $gc_ThisNum
        $gc_ThisNum = $gc_I
        _DownLoad_TopicBySN($gc_ThisNum)
        ; Persist the progress after every topic.
        _TopicProgram_Write($gc_StartNum, $gc_EndNum, $gc_ThisNum)
        ConsoleWrite($gc_I & @CRLF)
Next

; Not defined in this snippet - presumably provided by the included file; verify.
_CreateHtmlList()

; Reads one progress value from the "Program" section of the progress INI file.
; $s_Flags selects the key: 0 = "StartNum", 1 = "EndNum", 2 = "ThisNum".
; Returns the stored value, or $gc_DefaultNum when the key is absent (IniRead
; falls back to -1). Any other $s_Flags value yields an unassigned variable.
Func _TopicProgram_Read($s_Flags)
        Local $s_Return, $s_Key
        Switch $s_Flags
                Case 0
                        $s_Key = "StartNum"
                Case 1
                        $s_Key = "EndNum"
                Case 2
                        $s_Key = "ThisNum"
                Case Else
                        Return $s_Return
        EndSwitch
        $s_Return = IniRead($gc_StoreFolder & $gc_TopicProgram_FileName, "Program", $s_Key, -1)
        If $s_Return = -1 Then Return $gc_DefaultNum
        Return $s_Return
EndFunc

; Writes the three progress values (StartNum, EndNum, ThisNum) to the
; "Program" section of the progress INI file.
Func _TopicProgram_Write($s_StartNum, $s_EndNum, $s_ThisNum)
        Local $s_IniPath = $gc_StoreFolder & $gc_TopicProgram_FileName
        IniWrite($s_IniPath, "Program", "StartNum", $s_StartNum)
        IniWrite($s_IniPath, "Program", "EndNum", $s_EndNum)
        IniWrite($s_IniPath, "Program", "ThisNum", $s_ThisNum)
EndFunc

lynfr8 发表于 2009-4-20 11:30:10

文白真是太强了,看来这个工程顺利拿下来真是小case了
开心啊!!!!

wbfans 发表于 2009-4-20 12:05:25

推荐楼主用网文快捕下载,它能在线保存html网页,并且自己可以随意修改目录层次,可制作成chm格式电子书。
可能需要注册,楼主可以用4.36版的,网上有网友发布的可用注册号,如果找不到的话可联系我,我可将软件和序列号都提供给你。
Q:171219414

foosea 发表于 2009-4-20 12:08:48

不知LZ是收集所有回帖还是只收集第一页(只有指向部分回帖页的链接),因为有些回帖也挺有价值的,是否考虑过对回帖数较多的帖子作特别处理。。

[ 本帖最后由 foosea 于 2009-4-20 12:14 编辑 ]

bill-0970 发表于 2009-4-23 19:14:46

顶下

想参加,为论坛做点事

不知道要该做些什么
我的QQ:61577962

lynfr8 发表于 2009-4-25 04:54:59

5.1快到,期待着能如期完成
页: 1 [2] 3 4 5 6
查看完整版本: 代号"autoitscript5.1"英雄帖!【已放下载!汉化酝酿中】