-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathusort.ado
executable file
·235 lines (229 loc) · 12.1 KB
/
usort.ado
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
*! version 1.1.3 31oct2024 I I Bolotov
program def usort, sclass byable(onecall)
    version 14
    /*
        usort -- byable front end of the Unicode-aware sort.

        The command line (`0') is handed unchanged to the worker program
        _sort, either once for the whole dataset or once per by-group.  The
        worker supports a) custom first and last substrings, including system
        (.) and all remaining missing values, b) gsort-like syntax for the
        ascending and descending order, and c) conditional [if] and range
        [in] sorting.  Afterwards the data-sorted flag (sorted by) is set on
        the by- and sortvars by running the regular Stata sort on row-number
        stamps, so that the row order produced by _sort is left untouched.
        Sorting large datasets might be taxing on machine memory or disk space.

        Reads  : `0', `_byvars', `_byrc0', s(varlist) (set by _sort).
        Returns: nothing; s(varlist) is cleared before exit.
        Errors : 900 if fewer than two free variable slots are available.

        Author: Ilya Bolotov, MBA Ph.D.
        Date: 07 October 2024
    */
    // syntax
    qui desc
    // check for the number of observations and variable limits
    if ! _N exit 0
    if c(maxvar) - r(k) < 2 error 900
    ****
    tempfile tmpf
    tempvar byid n p s                    // n, p and s serve as Mata names only
    // sort the entire dataset at once or by groups
    if ! _by() _sort `0'
    else {
        /* One group at a time: isolate the group, sort it, park it in a
           temporary file, and move it to the end of the full dataset.  After
           the last pass the groups are stacked in the order of `byid'. */
        egen `byid' = group(`_byvars'), m autotype  // use .-.z/"" as values
        qui sum `byid'
        forv i = `=r(min)'/`=r(max)' {
            /* NOTE(review): the data are preserved while _sort runs; Stata
               does not nest -preserve-, so _sort must not preserve again --
               confirm. */
            preserve
            qui keep if `byid' == `i'
            `=cond("`_byrc0'"!="","cap","")' _sort `0'  // (ignore) group errors
            qui save `tmpf', replace
            restore
            qui drop if `byid' == `i'
            qui append using `tmpf', force
        }
        qui drop `byid'
    }
    // set the data-sorted flag on the by- and sortvars
    if "`_byvars'`s(varlist)'" != "" {
        /* Save each sortvar into a string (`svl`) or numeric (`nvl`) macro. */
        foreach var of varl `_byvars' `s(varlist)' {
            cap conf str var `var'
            if ! _rc loc svl "`svl' `var'"
            else     loc nvl "`nvl' `var'"
        }
        /* The original values of the sortvars are parked in the Mata matrices
           `s' (strings) and `n' (numbers) and temporarily overwritten with a
           row-number stamp `p'.  The regular Stata `sort` on the stamps then
           sets the flag without moving any row, after which the original
           values are written back. */
        mata: `p' = 1::st_nobs()
        if "`svl'" != "" {
            mata: `s' = st_sdata(., tokens("`svl'"))
            /* String stamps must all have the same width: `sort` compares
               strings bytewise, so "10" would precede "2" and the rows would
               be reshuffled.  Adding 10^(number of digits of N) to each row
               number yields strings of identical length. */
            mata: st_sstore(., tokens("`svl'"), strtrim(strofreal(           ///
                (`p' :+ 10^strlen(strtrim(strofreal(st_nobs(), "%18.0f"))))  ///
                # J(1, cols(tokens("`svl'")), 1), "%18.0f")))
        }
        if "`nvl'" != "" {
            mata: `n' = st_data(., tokens("`nvl'"))
            /* NOTE(review): the stamps 1..N are written into the sortvars'
               existing storage types; whether narrow types (byte, int, float)
               can hold every stamp for large N should be verified. */
            mata: st_store(., tokens("`nvl'"),                               ///
                `p' # J(1, cols(tokens("`nvl'")), 1))
        }
        sort `_byvars' `s(varlist)'       // sort and add the flag
        if "`svl'" != "" {
            mata: st_sstore(., tokens("`svl'"), `s')
            mata: mata drop `s'           // minimize memory usage
        }
        if "`nvl'" != "" {
            mata: st_store(., tokens("`nvl'"), `n')
            mata: mata drop `n'           // minimize memory usage
        }
        mata: mata drop `p'               // do not leave the stamp in Mata
        sret loc varlist ""               // drop the sclass macro
    }
end
program def _sort, sclass
    /*
        _sort -- worker of usort: reorders the rows selected by [if] [in]
        according to locale-aware sort keys built from the sortvars.

        Syntax : _sort [+|-]varname [...] [if] [in] [, first() last() ignorec
                 mfirst mlast ignorem locale() st() case() cslv() norm()
                 num() alt() fr() format() codepoint() *]
        Returns: s(varlist) -- the expanded list of sortvars (without signs).
        Errors : 100 if a sign is followed by whitespace; any error raised
                 by -ds-, -confirm format-, -tostring- or the Mata calls.

        The sort keys are built in temporary copies of the sortvars, so the
        dataset itself is not modified until the final collation and no
        -preserve- is required (the caller may therefore hold its own).
    */
    // syntax
    syntax                                                                   ///
        anything [if] [in] [,                                                ///
        First(string asis) Last(string asis) ignorec MFirst MLast ignorem    ///
        LOCale(string) st(integer -1) case(integer -1) cslv(integer -1)      ///
        norm(integer -1) num(integer -1) alt(integer -1) fr(integer -1)      ///
        format(string) codepoint(integer 129769) *                           ///
        ]
    // adjust and preprocess options
    if ustrregexm("`anything'", "[+-]\s+") error 100
    loc anything = subinstr("`anything'", "+", "", .)    /* strip plus signs */
    foreach s in `anything' {                            /* expand wildcards */
        loc sign = cond(ustrregexm("`s'", "^\s*-\s*"), "-", "")
        qui ds `= ustrregexrf("`s'", "^\s*-\s*", "", . )'
        mata: st_local("signlist", st_local("signlist") + " " +              ///
            invtokens("`sign'" :+ tokens(st_global("r(varlist)"))))
        mata: st_local("varlist", st_local("varlist") + " " +                ///
            st_global("r(varlist)"))
        mata: st_global("s(varlist)", st_local("varlist"))
    }
    loc anything "`signlist'"
    /* mfirst replaces first() with ".a ... .z ."; mlast (or the absence of
       both first() and last()) replaces last() with ".z ... .a .".
       NOTE(review): the mfirst list ends with ".", so the system missing
       value is ranked after .z inside the first block, whereas the mlast
       list yields ., .a, ..., .z -- confirm that this is intended. */
    if ("`mfirst'" != "" ) & "`ignorem'" == ""                               ///
        mata: st_local("first", invtokens("." :+ (tokens(c("alpha")))) + " .")
    if ("`mlast'" != "" | `"`first'`last'"' == "") & "`ignorem'" == ""       ///
        mata: st_local("last",                                               ///
            invtokens("." :+ (tokens(strreverse(c("alpha"))))) + " .")
    loc format = cond("`format'" != "", "`format'", "%32.16f")
    conf form `format'
    loc locale = cond("`locale'" != "", "`locale'",                          ///
        c(locale_functions))
    ****
    tempvar select
    tempname n p s
    // flag the rows to be sorted: all rows or [if] [in]
    g byte `select' = 0
    qui replace `select' = 1 `if' `in'
    qui count if `select'
    if ! r(N) exit 0                      // nothing selected, nothing to sort
    // build one string sort key per sortvar in a temporary variable
    foreach var of varl `varlist' {
        /* Non-numeric strings cannot be 'destringed', so every key is a
           string: numeric sortvars are converted with the %fmt given in
           format(), string sortvars are cloned as they are. */
        tempvar key
        cap conf str var `var'
        sca `n' = cond(_rc, 1, 0)         // flag numeric sortvars
        if `n' qui tostring `var', gen(`key') force format(`format')
        else   qui clonevar `key' = `var'
        loc keys "`keys' `key'"
        /* `s' = maximum string length of the key; for numeric sortvars `n'
           becomes the maximum length of the integer part. */
        mata: st_numscalar("`s'", max(strlen(st_sdata(., "`key'"))))
        if `n' mata: st_numscalar("`n'", max(strlen(ustrregexrf(             ///
            st_sdata(., "`key'"), "^(\d+)[.,]\d+$", "$1"))))
        /* Equate string missing values "" and the 'tostringed' sysmiss ".". */
        qui replace `key' = "." if mi(`key')
        /* first(): a matching key is replaced with `s' blanks followed by
           the zero-padded rank of the matching word, so that the matches
           come first in the listed order.  With ignorem, keys that look like
           missing values (^[.a-z]{0,2}$) are left alone.  The padding width
           is the number of digits of the word count; a narrower width would
           rank "10" before "2". */
        if `"`first'"' != "" {
            loc f = lower(ustrregexrf(`"`first'"',                           ///
                ".+,\s*([ustr]*regex[m]*|[ustr]*pos)$", "$1"))
            if `"`f'"' == lower(`"`first'"') loc f "strmatch"
            else if ustrregexm("`f'", "regex") loc f "ustrregexm"
            else if ustrregexm("`f'", "[r]pos") loc f "ustrrpos"
            else if ustrregexm("`f'", "pos") loc f "ustrpos"
            if "`ignorec'" != "" loc t "ustrlower"
            loc wrds = `t'(ustrregexrf(`"`first'"',                          ///
                ",\s*([ustr]*regex[m]*|[ustr]*pos)$", ""))
            loc nw : word count `wrds'
            forv j = 1/`nw' {
                loc w : word `j' of `wrds'
                loc k = "0" * (strlen("`nw'") - strlen("`j'")) + "`j'"
                qui replace `key' = " " * `s' + `"`k'"'                      ///
                    if cond("`ignorem'" == "", 1,                            ///
                        ! ustrregexm(`key', `"^[.a-z]{0,2}$"')) &            ///
                    cond("`f'" != "strmatch", `f'(`t'(`key'), `"`w'"'),      ///
                        `t'(`key') == `"`w'"')
            }
        }
        /* last(): same mechanism, but the prefix is uchar(codepoint())
           repeated `s' times and the ranks run downwards, i.e., the first
           listed word receives the highest rank. */
        if `"`last'"' != "" {
            loc f = lower(ustrregexrf(`"`last'"',                            ///
                ".+,\s*([ustr]*regex[m]*|[ustr]*pos)$", "$1"))
            if `"`f'"' == lower(`"`last'"') loc f "strmatch"
            else if ustrregexm("`f'", "regex") loc f "ustrregexm"
            else if ustrregexm("`f'", "[r]pos") loc f "ustrrpos"
            else if ustrregexm("`f'", "pos") loc f "ustrpos"
            if "`ignorec'" != "" loc t "ustrlower"
            loc wrds = `t'(ustrregexrf(`"`last'"',                           ///
                ",\s*([ustr]*regex[m]*|[ustr]*pos)$", ""))
            loc nw : word count `wrds'
            forv j = 1/`nw' {
                loc w : word `j' of `wrds'
                loc r = `nw' - `j' + 1
                loc k = "0" * (strlen("`nw'") - strlen("`r'")) + "`r'"
                qui replace `key' = uchar(`codepoint') * `s' + `"`k'"'       ///
                    if cond("`ignorem'" == "", 1,                            ///
                        ! ustrregexm(`key', `"^[.a-z]{0,2}$"')) &            ///
                    cond("`f'" != "strmatch", `f'(`t'(`key'), `"`w'"'),      ///
                        `t'(`key') == `"`w'"')
            }
        }
        /* Natural sorting of numbers requires leading zeros in the integer
           part of each 'tostringed' number. */
        if `n' qui replace `key' = "0" * (`n' - strlen(ustrregexrf(`key',    ///
            "^(\d+)[.,]\d+$", "$1"))) + `key'
        /* Turn the key into a collation key for the locale.  The integer
           options default to -1, so the plain function is used only when
           none of them was changed. */
        if `st' == -1 & `case' == -1 & `cslv' == -1 & `norm' == -1 &         ///
           `num' == -1 & `alt' == -1 & `fr' == -1                            ///
            qui replace `key' = ustrsortkey(`key', "`locale'")
        else qui replace `key' = ustrsortkeyex(`key', "`locale'", `st',      ///
            `case', `cslv', `norm', `num', `alt', `fr')
    }
    /* The permutation vector `p' is the first column (running row number of
       the selected rows) of the matrix [row number, keys] after sorting it
       on columns 2 through the number of sortvars + 1; a negative column
       index means descending order for that sortvar. */
    qui sum `select'                      // r(sum) = number of 1s
    mata: `p' = strtoreal(sort((strofreal(1::st_numscalar("r(sum)")),        ///
        st_sdata(., tokens("`keys'"), "`select'")),                          ///
        (2..(cols(tokens("`keys'")) + 1)) :*                                 ///
        strtoreal(tokens(ustrregexra(ustrregexra("`anything'",               ///
        "(^|\s+)\w+", " 1"), "(^|\s+)-\w+", " -1")))                         ///
        )[.,1])
    qui drop `keys'                       // the keys are no longer needed
    // collate the selected rows of every variable on the permutation vector
    foreach var of varl * {
        if "`var'" == "`select'" continue // constant within the selection
        cap conf str var `var'
        if ! _rc {
            mata: `s' = st_sdata(., "`var'", "`select'")
            mata: _collate(`s', `p')
            mata: st_sstore(., "`var'", "`select'", `s')
            mata: mata drop `s'           // minimize memory usage
        }
        else {
            mata: `n' = st_data(., "`var'", "`select'")
            mata: _collate(`n', `p')
            mata: st_store(., "`var'", "`select'", `n')
            mata: mata drop `n'           // minimize memory usage
        }
    }
    // set a data-have-changed flag if `p' results in collation of variable rows
    mata: if (`p' != sort(`p', 1)) st_updata(1);;
    mata: mata drop `p'                   // minimize memory usage
end