正则匹配的问题(已解决)
本帖最后由 gto250 于 2012-6-2 20:38 编辑先附源码
#Include <Array.au3>
#include <ButtonConstants.au3>
#include <EditConstants.au3>
#include <GUIConstantsEx.au3>
#include <WindowsConstants.au3>
$Form1 = GUICreate("Form1", 968, 304, 342, 220)
$Edit1 = GUICtrlCreateEdit("", 8, 13, 233, 169)
GUICtrlSetData(-1, StringFormat("<Row ss:AutoFitHeight="&Chr(34)&"0"&Chr(34)&" ss:Height="&Chr(34)&"21.9375"&Chr(34)&" ss:StyleID="&Chr(34)&"s30"&Chr(34)&">\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">1</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">500033659</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">电杆</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">锥形水泥杆,非预应力,整根杆,10m,190mm,I</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">2</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">基</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">Ф190*10非</Data></Cell>\r\n </Row> "))
$Edit2 = GUICtrlCreateEdit("", 248, 13, 233, 169)
GUICtrlSetData(-1, StringFormat("<Row ss:AutoFitHeight="&Chr(34)&"0"&Chr(34)&" ss:Height="&Chr(34)&"18.75"&Chr(34)&">\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&" ss:Formula="&Chr(34)&"=ROW()-2"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">1</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">500034429</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">横担</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">L6*60*1500</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">6</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">块</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">线路角铁横担,∠60×6,1500mm,不计孔距,中间</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&"></Data></Cell>\r\n </Row>"))
$Edit3 = GUICtrlCreateEdit("", 489, 13, 233, 169)
GUICtrlSetData(-1, StringFormat(" <Row ss:AutoFitHeight="&Chr(34)&"0"&Chr(34)&" ss:Height="&Chr(34)&"21.9375"&Chr(34)&" ss:StyleID="&Chr(34)&"s30"&Chr(34)&">\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">1</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">500033659</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">电杆</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">锥形水泥杆,非预应力,整根杆,10m,190mm,I</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">2</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">基</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s29"&Chr(34)&">\r\n </Row>"))
$Edit4 = GUICtrlCreateEdit("", 729, 13, 233, 169)
GUICtrlSetData(-1, StringFormat(" <Row ss:AutoFitHeight="&Chr(34)&"0"&Chr(34)&" ss:Height="&Chr(34)&"18.75"&Chr(34)&">\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&" ss:Formula="&Chr(34)&"=ROW()-2"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">1</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">500034429</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">横担</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">L6*60*1500</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"Number"&Chr(34)&">6</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">块</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&"><Data ss:Type="&Chr(34)&"String"&Chr(34)&">线路角铁横担,∠60×6,1500mm,不计孔距,中间</Data></Cell>\r\n <Cell ss:StyleID="&Chr(34)&"s24"&Chr(34)&">\r\n </Row>"))
$Button1 = GUICtrlCreateButton("(1)匹配1和2", 24, 200, 75, 25)
$Button2 = GUICtrlCreateButton("(2)匹配1和2", 118, 202, 75, 25)
$Button3 = GUICtrlCreateButton("Button1", 221, 203, 75, 25)
$Button4 = GUICtrlCreateButton("Button1", 312, 203, 75, 25)
GUISetState(@SW_SHOW)
While 1
$nMsg = GUIGetMsg()
Switch $nMsg
Case $GUI_EVENT_CLOSE
Exit
Case $Button1
$c=str_arr(GUICtrlRead($Edit1))
_ArrayDisplay($c)
Case $Button2
$c=str_arr(GUICtrlRead($Edit2))
_ArrayDisplay($c)
EndSwitch
WEnd
Func str_arr($str)
Local $arr
$arr="序号 物料代码 材料名称 规格型号 数量 单位 备注"
$reg_str_1="<Row.*>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*</Row>"
$reg_str_2="<Row.*>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*<Cell.*><Data.*>(.*)</Data></Cell>\s*</Row>"
Local $array
Local $nOffset = 1
While 1
$array = StringRegExp($str, $reg_str_1, 1, $nOffset)
If @error=1 Then
$array = StringRegExp($str, $reg_str_2, 1, $nOffset)
EndIf
If @error = 0 Then
$nOffset = @extended
Else
ExitLoop
EndIf
If $array<>"序号" Then
$c=_ArrayToString($array,@TAB)
_ArrayAdd($arr,$c)
EndIf
WEnd
Return $arr
EndFunc
我想实现的是对一些文本文件的匹配工作,因为格式的问题,正则并不能完全起工作,文本的格式有以下四种
1、
<Row ss:AutoFitHeight="0" ss:Height="21.9375" ss:StyleID="s30">
<Cell ss:StyleID="s29"><Data ss:Type="Number">1</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="Number">500033659</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="String">电杆</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="String">锥形水泥杆,非预应力,整根杆,10m,190mm,I</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="Number">2</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="String">基</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="String">Ф190*10非</Data></Cell>
</Row>
2、
<Row ss:AutoFitHeight="0" ss:Height="18.75">
<Cell ss:StyleID="s24" ss:Formula="=ROW()-2"><Data ss:Type="Number">1</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="Number">500034429</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String">横担</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String">L6*60*1500</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="Number">6</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String">块</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String">线路角铁横担,∠60×6,1500mm,不计孔距,中间</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String"></Data></Cell>
</Row>
3、
<Row ss:AutoFitHeight="0" ss:Height="21.9375" ss:StyleID="s30">
<Cell ss:StyleID="s29"><Data ss:Type="Number">1</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="Number">500033659</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="String">电杆</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="String">锥形水泥杆,非预应力,整根杆,10m,190mm,I</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="Number">2</Data></Cell>
<Cell ss:StyleID="s29"><Data ss:Type="String">基</Data></Cell>
<Cell ss:StyleID="s29">
</Row>
4、
<Row ss:AutoFitHeight="0" ss:Height="18.75">
<Cell ss:StyleID="s24" ss:Formula="=ROW()-2"><Data ss:Type="Number">1</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="Number">500034429</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String">横担</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String">L6*60*1500</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="Number">6</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String">块</Data></Cell>
<Cell ss:StyleID="s24"><Data ss:Type="String">线路角铁横担,∠60×6,1500mm,不计孔距,中间</Data></Cell>
<Cell ss:StyleID="s24">
</Row>
当然,这是全部文本就截取的部分,1和3是一样的,只不过<Cell这个标签后面少了<Data的标签,2和4也同样。1和2是不一样的,一个有7行数据,一个有8行数据
我要做的是将<data>标签间的数据提取出来
我写了一个func,就是上面贴的代码,能做到1和2中的数据提取,但是做不了3和4中的数据也提取
想请教各位正则高手,如何写个正则可以对这个4种格式的数据提取,通用的正则
二楼的代码很好,但是我没有表述清楚,<row></row>标签是之内的数据一行数据,<cell></cell>是每个表格的数据,我这里给出的只是截取的一段数据,整个文本中是有很多<row></row>标签的
我只要提取<row></row>标签内有7个或者8个<cell></cell>标签内的<Data></Data>标签中的数据,(因为其他有些<row>标签内是小于7个<cell>标签或者大于8个<cell>的)从我上面给出的4个格式中可以看出,3和4文本中最后一个<cell>标签是不完整的,但是还是符合在<row>标签内有7个或者8个<cell>标签
发现自己说的很啰嗦,不知道大家看懂没!
1、我要找到文本中有7个或者8个<cell>标签的<row>,然后提取里面的全部<Data>标签内的数据
2、能做到以上4种格式的通用正则形式
回复 1# gto250
老大是不是考虑得太复杂了?
我估摸着, 按自己的思路, 续着编写, 以 LISTVIEW 方式显示数据, 不知合适不?
While 1
$nMsg = GUIGetMsg()
Switch $nMsg
Case $GUI_EVENT_CLOSE
Exit
Case $Button1
$c = str_arr(GUICtrlRead($Edit1))
If Not @error Then _ListView($c)
Case $Button2
$c = str_arr(GUICtrlRead($Edit2))
If Not @error Then _ListView($c)
Case $Button3
$c = str_arr(GUICtrlRead($Edit3))
If Not @error Then _ListView($c)
Case $Button4
$c = str_arr(GUICtrlRead($Edit4))
If Not @error Then _ListView($c)
EndSwitch
WEnd
Func str_arr($str)
Local $arr = StringRegExp($str, '>([^<]+)</Data>', 3)
If @error Then Return SetError(1)
Return $arr
EndFunc ;==>str_arr
Func _ListView($aArray)
Local $data = "序号|物料代码|材料名称|规格型号|数量|单位|备注"
Local $form2 = GUICreate('Listview result', 500, 300)
$listview = GUICtrlCreateListView($data, 8, 8, 480, 280)
$data = $aArray
For $i = 1 To UBound($aArray) - 1
$data &= '|' & $aArray[$i]
Next
GUICtrlCreateListViewItem($data, $listview)
GUISetState()
While 1
$nMsg = GUIGetMsg()
Switch $nMsg
Case $GUI_EVENT_CLOSE
ExitLoop
EndSwitch
WEnd
GUIDelete($form2)
EndFunc ;==>_ListView 回复 2# user3000
谢谢你的回复,可能我没有表述清楚!我再修改一下提问 本帖最后由 user3000 于 2012-6-1 23:31 编辑
回复 3# gto250
是我没仔细看, 呵呵, 文字加代码'太多'了!
那么可以考虑把'不合格'的数据先转化为'合格'的数据再作处理不?
比如:
$str = StringRegExpReplace(GUICtrlRead($Edit4), '(?<=[\n\r])(\h*<Cell[^>]+>)(?=[\r\n])', '\1<Data ss:Type="String"></Data></Cell>')
MsgBox(0, '', $str & @CRLF & @CRLF & GUICtrlRead($Edit2))
没注意看, 原来的代码有误, 现在已修正! 看了半天,也不知道是不是这意思:
表达式:<Row.+\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<\v]*)(?:</Data></Cell>)?\s+(?:.+?>([^<\v]*)(?:</Data></Cell>)?\s+)?</Row>测试代码:#include <Array.au3>
Local $Str1 = _
'<Row ss:AutoFitHeight="0" ss:Height="21.9375" ss:StyleID="s30">' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="Number">1</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="Number">500033659</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="String">电杆</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="String">锥形水泥杆,非预应力,整根杆,10m,190mm,I</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="Number">2</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="String">基</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="String">Ф190*10非</Data></Cell>' & @CRLF & _
' </Row>'
Local $Str2 = _
'<Row ss:AutoFitHeight="0" ss:Height="18.75">' & @CRLF & _
' <Cell ss:StyleID="s24" ss:Formula="=ROW()-2"><Data ss:Type="Number">1</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="Number">500034429</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String">横担</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String">L6*60*1500</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="Number">6</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String">块</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String">线路角铁横担,∠60×6,1500mm,不计孔距,中间</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String"></Data></Cell>' & @CRLF & _
' </Row>'
Local $Str3 = _
'<Row ss:AutoFitHeight="0" ss:Height="21.9375" ss:StyleID="s30">' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="Number">1</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="Number">500033659</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="String">电杆</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="String">锥形水泥杆,非预应力,整根杆,10m,190mm,I</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="Number">2</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29"><Data ss:Type="String">基</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s29">' & @CRLF & _
' </Row>'
Local $Str4 = _
'<Row ss:AutoFitHeight="0" ss:Height="18.75">' & @CRLF & _
' <Cell ss:StyleID="s24" ss:Formula="=ROW()-2"><Data ss:Type="Number">1</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="Number">500034429</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String">横担</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String">L6*60*1500</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="Number">6</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String">块</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24"><Data ss:Type="String">线路角铁横担,∠60×6,1500mm,不计孔距,中间</Data></Cell>' & @CRLF & _
' <Cell ss:StyleID="s24">' & @CRLF & _
' </Row>'
_GetData($Str1, 1)
_GetData($Str2, 2)
_GetData($Str3, 3)
_GetData($Str4, 4)
Func _GetData($str, $i)
Local $Test = StringRegExp($str, '<Row.+\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<]*)</Data></Cell>\s+.+?>([^<\v]*)(?:</Data></Cell>)?\s+(?:.+?>([^<\v]*)(?:</Data></Cell>)?\s+)?</Row>', 1)
_ArrayDisplay($Test, $i)
EndFunc ;==>_GetData
本帖最后由 Duvet 于 2012-6-2 00:39 编辑
重寫了str_arr函數
Func str_arr(ByRef $Text)
Local $Result = [["序号", "物料代码", "材料名称", "规格型号", "数量", "单位", "备注", ""]]
Local $ResultTotal = 1
Local $Regex = ["(?i)<Row(?>(?>[^<]|<(?!/Row|data))++<data(?>""[^""]*""|'[^']*'|[^>])++>[^<>]*<){7,8}(?>[^<]|<(?!/Row|data))++</Row>", "(?i)>([^<>]*)</data"]
Local $tArray = StringRegExp($Text, $Regex, 3)
If @error Then Return SetError(-1, 0, "")
For $i = 0 To UBound($tArray) - 1
Local $tResult = StringRegExp($tArray[$i], $Regex, 3)
ReDim $Result[$ResultTotal + 1]
For $j = 0 To UBound($tResult) - 1
$Result[$ResultTotal][$j] = $tResult[$j]
Next
$ResultTotal += 1
Next
Return SetError(0, $ResultTotal - 1,$Result)
EndFunc
在<Row></Row>符合條件下(7至8個Data),第8個Data絕對是空值的話,改下這兩行
Local $Result = [["序号", "物料代码", "材料名称", "规格型号", "数量", "单位", "备注"]]
$Regex = "(?i)>([^<>]+)</data" Func str_arr($str)
$str=StringRegExpReplace($str,"(?s)\<\/Cell\>\s*(\<\/Row\>)","$1")
$str=StringReplace($str,"</Cell>",@TAB)
$str=StringReplace($str,"</Row>",@CRLF)
$str=StringRegExpReplace($str,"(?s)\<.*?\>","")
$arr=StringSplit($str,@tab)
Return $arr
EndFunc有点像数据库源文件整理,我倾向于用这种方法 谢谢所有以上的兄弟们,还是A版的代码最适合,需要处理的文本是我从EXCEL导出的xml表格中截取的,像6楼的兄弟代码,因为我并不知道所有处理的文本是否有空值,因此我只能用通用的正则来匹配。7楼的兄弟,如果我在<row>标签前加入其它的文字,也会被截取到。
不过都谢谢你们!
页:
[1]