; 本帖最后由 sex123 于 2014-4-30 18:58 编辑
#include <array.au3>
; Manually follows an HTTP redirect chain (automatic redirects are disabled in
; _Get1) until a response without a Location header is reached, then shows the
; origin (scheme + host) of the final URL and stores the last response body in
; href.ini.
; Only meaningful for DOI-style redirect chains.
;
; Sample start URLs kept for manual testing:
;$a = "http://www.baidu.com"
;Local $a= "http://dx.doi.org/10.1002/jmri.24514" ; Wiley: three redirects
;Local $a= "http://dx.doi.org/10.4040/jkan.2013.43.6.746" ; jkan: one redirect
;Local $a= "http://dx.doi.org/10.1007/s00417-011-1652-6" ; Springer
Local $a = "http://linkinghub.elsevier.com/retrieve/pii/S0014-4835(11)00104-7"
;Local $a = "http://www.baidu.com"
;Local $a= "http://linkinghub.elsevier.com/retrieve/articleSelectSinglePerm?Redirect=http://www.sciencedirect.com/science/article/pii/S0014483511001047?via%3Dihub"
;Local $a= "http://biomedicaloptics.spiedigitallibrary.org/article.aspx?doi=10.1117/1.3548880" ; SPIE
;Local $a= "http://dx.doi.org/10.1038/nature12065" ; Nature: does not work

; Filled by _Get1 with the raw (binary) body of the most recent response.
Global $urlBody

; Upper bound on hops so that a redirect cycle cannot hang the script.
Local Const $MAX_REDIRECTS = 20

Local $temphead = ""  ; origin ("scheme://host") of the URL currently being requested
Local $sLocation      ; Location header returned for the current request
Local $iHops = 0

While 1
	; Remember the origin of the URL we are about to request; it is needed to
	; resolve a host-relative Location and is what gets reported at the end.
	$temphead = _spl($a)

	$sLocation = _Get1($a)
	; No Location header: $a is the final URL, stop following.
	If $sLocation = "" Then ExitLoop

	If StringLeft($sLocation, 4) = "http" Then
		; Absolute redirect target: use it as-is.
		$a = $sLocation
	ElseIf StringLeft($sLocation, 2) = "//" Then
		; Protocol-relative target: reuse the scheme of the current URL.
		$a = StringLeft($temphead, StringInStr($temphead, "//") - 1) & $sLocation
	ElseIf StringLeft($sLocation, 1) = "/" Then
		; Host-relative target: prepend the origin of the current URL.
		$a = $temphead & $sLocation
	Else
		; Path-relative (or otherwise unrecognised) target. It cannot be turned
		; into an absolute URL here, and passing it to _spl would fail, so stop.
		ExitLoop
	EndIf

	$iHops += 1
	If $iHops >= $MAX_REDIRECTS Then ExitLoop
WEnd

; $a now holds the last URL that was resolved; $temphead is the origin of the
; last URL that was actually requested.
MsgBox(0, 0, $temphead)
$urlBodystr = BinaryToString($urlBody)
IniWrite("href.ini", "aa", "bb", $urlBodystr)
; _Get1: issue a single GET request with automatic redirects disabled.
;
; Parameters:
;   $url - absolute URL to request.
; Returns:
;   The value of the response's Location header, or "" when the response has
;   no Location header (i.e. it is not a redirect) or the HTTP object could
;   not be created (@error = 1 in that case).
; Side effects:
;   Stores the raw binary response body in the global $urlBody and shows it,
;   converted with BinaryToString, in a message box.
Func _Get1($url)
	Local $oHTTP = ObjCreate("WinHttp.WinHttpRequest.5.1")
	If Not IsObj($oHTTP) Then Return SetError(1, 0, "")

	$oHTTP.SetTimeouts(15000, 15000, 15000, 15000)
	; Option 6 = WinHttpRequestOption_EnableRedirects. Turned off so that every
	; hop of the redirect chain is visible to the caller.
	$oHTTP.Option(6) = False
	$oHTTP.Open("GET", $url, False)
;~ $oHTTP.setRequestHeader('Accept','image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, */*')
;~ $oHTTP.setRequestHeader('Accept-Language','zh-CN')
	$oHTTP.setRequestHeader('User-Agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Media Center PC 4.0; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322)')
	; Deliberately NOT sending "Accept-Encoding: gzip, deflate": this COM object
	; hands back the body exactly as received, so advertising compression makes
	; servers return gzip data that BinaryToString renders as unreadable bytes.
;~ $oHTTP.setRequestHeader('Host','dx.doi.org')
;~ $oHTTP.setRequestHeader('Connection','Keep-Alive')
	$oHTTP.Send()

	; Look the Location header up in the full header block instead of calling
	; GetResponseHeader("Location"), which raises a COM error when the header
	; is absent - and it is absent on the last hop of every chain.
	Local $sHeaders = $oHTTP.GetAllResponseHeaders()
	Local $urlHead = ""
	Local $aMatch = StringRegExp($sHeaders, "(?i)(?:\A|\n)Location:[ \t]*([^\r\n]*)", 1)
	If Not @error Then $urlHead = StringStripWS($aMatch[0], 3)

	$urlBody = $oHTTP.ResponseBody
	; NOTE(review): BinaryToString defaults to ANSI; pages served as UTF-8 may
	; need BinaryToString($urlBody, 4) to display non-ASCII text correctly.
	MsgBox(0,0,BinaryToString($urlBody))
	Return $urlHead
EndFunc ;==>_Get1
; _spl: extract the origin (everything up to and including the host) from a
; full URL, e.g. "http://www.nature.com/nature7/sow.pdf" -> "http://www.nature.com".
; Used while following redirects hop by hop to rebuild absolute URLs from
; host-relative Location values.
;
; Parameters:
;   $a - URL containing "//" (for example "http://host/path").
; Returns:
;   The text before the first "//", the "//" itself, and the host part that
;   follows it (cut at the first "/", "?" or "#").
;   Returns "" and sets @error = 1 when $a contains no "//".
Func _spl($a)
	; All working data is Local so that this function no longer overwrites
	; same-named globals of the calling script.
	Local $aMatch = StringRegExp($a, "(?s)^(.*?//[^/?#]*)", 1)
	If @error Then Return SetError(1, 0, "")
	Return $aMatch[0]
EndFunc ;==>_spl
; Runs after the statements above the function definitions: display the body
; of the most recent response (held in the global $urlBody) once more.
Local $sLastBody = BinaryToString($urlBody)
MsgBox(0, 0, $sLastBody)
; 为什么这段代码显示不出网页中可读的文字呢?