-
Notifications
You must be signed in to change notification settings - Fork 0
/
catalogue.py
1375 lines (1170 loc) · 62.1 KB
/
catalogue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import glob
import os
import warnings
from inspect import currentframe, getframeinfo
import numpy as np
import pandas as pd
from astropy.coordinates import SkyCoord
from astropy.io import fits as f
from astropy.io.votable import parse_single_table
from astropy.table import Table
from astropy.utils.exceptions import AstropyWarning
from functions import config2dic, flux_at_freq, SED, two_freq_power_law, axis_lim
# Ignore annoying astropy warnings and set my own obvious warning output
warnings.simplefilter('ignore', category=AstropyWarning)
# Capture this frame so warnings.warn_explicit can report this file (and a
# live line number via cf.f_lineno) as the origin of custom warnings below.
cf = currentframe()
WARN = '\n\033[91mWARNING: \033[0m' + getframeinfo(cf).filename
class catalogue(object):
    def __init__(self, filename, name, image=None, frequency=1400, search_rad=10.0,
                 finder=None, ra_col='ra', dec_col='dec', ra_fmt='deg', dec_fmt='deg',
                 flux_col='int_flux', flux_err_col='err_int_flux', peak_col='peak_flux',
                 peak_err_col='err_peak_flux', rms_val='local_rms', flux_unit='Jy',
                 use_peak=False, island_col='island', flag_col='flags', maj_col='a',
                 SNR=5.0, col_suffix='', sep='\t', basename=None, autoload=True,
                 verbose=False):
        """Initialise a catalogue object.

        Arguments:
        ----------
        filename : string
            The path to a FITS, csv, xml or 'sep'-delimited catalogue.
        name : string
            A unique short-hand name for the catalogue (e.g. 'NVSS'). This
            will act as the key for a number of dictionaries of the key catalogue
            fields, including the positions and fluxes.

        Keyword arguments:
        ------------------
        image : radio_image
            A radio image object used to initialise certain fields.
        frequency : float
            The frequency of this image in MHz. Overridden by image.freq when
            an image is given.
        search_rad : float
            The search radius in arcsec to use when cross-matching this catalogue.
            Overridden by 3*image.posErr when an image is given.
        finder : string
            The source finder that produced this catalogue (if known)
            ['Aegean' | 'Selavy' | 'pybdsm']. This sets all the column names
            (including rms) and flux units to their defaults for that source finder.
        ra_col : string
            The name of the RA column.
        dec_col : string
            The name of the DEC column.
        ra_fmt : string
            The format of the RA column, input to SkyCoord (e.g. 'deg' or 'hour').
        dec_fmt : string
            The format of the DEC column, input to SkyCoord.
        flux_col : string
            The name of the integrated flux column.
        flux_err_col : string
            The name of the integrated flux error column. Use None if this doesn't exist,
            and 10% errors will be assumed.
        peak_col : string
            The name of the peak flux column (if any). Use None if this doesn't exist
            and it won't be used.
        peak_err_col : string
            The name of the peak flux error column. Use None if this doesn't exist,
            and 10% errors will be assumed.
        rms_val : string or float
            The name of the local rms column, or a fixed value across the whole catalogue.
            The units must be the same as the flux columns.
        flux_unit : string
            The (case-insensitive) units of all the flux columns ['Jy' | 'mJy' | 'uJy'].
        use_peak : bool
            Use the peak flux instead of the integrated flux.
        island_col : string
            The name of the island column (if any).
        flag_col : string
            The name of the flag column (if any).
        maj_col : string
            The name of the fitted major axis column (if any). This is assumed to be in arcsec.
        SNR : float
            The minimum signal-to-noise ratio of the input catalogue.
        col_suffix : string
            A suffix to add to the end of all column names (e.g. '_deep' for GLEAM).
        sep : string
            When reading in a delimited file, use this delimiter (not needed for csv file).
        basename : string
            The base of the name to use for all output catalogues. Use None to use the same
            as parameter 'name'.
        autoload : bool
            Look for files already processed and load these if they exist.
        verbose : bool
            Verbose output.

        See Also
        --------
        astropy.coordinates.SkyCoord
        pandas.DataFrame"""
        print("--------------------------")
        print("| Reading {0} catalogue |".format(name))
        print("--------------------------")
        self.verbose = verbose
        self.name = name
        self.filename = filename.split('/')[-1]
        self.image = image
        self.SNR = SNR
        # Set basename
        self.basename = basename
        if self.basename is None:
            self.basename = name
        # Set names of all output catalogues
        self.cutout_name = "{0}_cutout.csv".format(self.basename)
        self.filtered_name = "{0}_filtered.csv".format(self.basename)
        self.si_name = ''
        si_files = glob.glob("{0}*_si.csv".format(self.basename))
        if len(si_files) > 0:
            # This is a guess, but is updated later if doesn't exist
            self.si_name = si_files[0]
        # Look for files already processed in order of preference
        # (spectral-index > filtered > cutout); when found, load that file
        # instead of the raw input catalogue.
        fileFound = False
        if autoload:
            fileFound = True
            if os.path.exists(self.si_name):
                filename = self.si_name
                print("Spectral index catalogue already exists. Using\
                file '{0}'".format(filename))
            elif os.path.exists(self.filtered_name):
                filename = self.filtered_name
                print("Filtered catalogue already exists. Using file '{0}'".format(filename))
            elif os.path.exists(self.cutout_name):
                filename = self.cutout_name
                print("Cutout catalogue already exists. Using file '{0}'".format(filename))
            else:
                fileFound = False
        # Convert file to pandas data frame
        self.df = self.cat2df(filename, sep, verbose=True)
        # Read frequency and search radius from image object if exists,
        # otherwise from input paramaters
        if self.image is not None:
            self.frequency = self.image.freq
            self.search_rad = 3*self.image.posErr
        else:
            self.frequency = frequency
            self.search_rad = search_rad
        # To ensure unique column names, append catalogue name to beginning of
        # column names when not already done (processed files already have it)
        if not fileFound:
            self.df.columns = '{0}_'.format(self.name) + self.df.columns
        self.col_suffix = col_suffix
        # Normalise the finder name to lower case; all later comparisons
        # must use the lower-case form.
        if finder is None:
            self.finder = None
        else:
            self.finder = finder.lower()
        self.knownFinder = (self.finder in ['aegean', 'selavy', 'pybdsm'])
        if self.finder is not None and not self.knownFinder:
            warnings.warn_explicit("Unrecognised source finder: {0}. Use\
            'Aegean', 'Selavy' or 'pybdsm'\n".format(finder), UserWarning,
                                   WARN, cf.f_lineno)
        if self.finder is not None and self.finder == 'selavy':
            if self.verbose:
                print('Using default configuration for Selavy.')
            # Set default column names for Selavy, appending catalogue name to beginning
            self.flux_col = self.unique_col_name('flux_int')
            self.flux_err_col = self.unique_col_name('flux_int_err')
            self.peak_col = self.unique_col_name('flux_peak')
            self.peak_err_col = self.unique_col_name('flux_peak_err')
            self.rms_val = self.unique_col_name('rms_image')
            self.island_col = self.unique_col_name('island_id')
            self.flag_col = self.unique_col_name('fit_is_estimate')
            self.maj_col = self.unique_col_name('maj_axis')
            self.ra_col = self.unique_col_name('ra_deg_cont')
            self.dec_col = self.unique_col_name('dec_deg_cont')
            self.ra_fmt = 'deg'
            self.dec_fmt = 'deg'
            self.flux_unit = 'mjy'
            self.si_col = self.unique_col_name('spectral_index')
        elif self.finder is not None and self.finder == 'pybdsm':
            if self.verbose:
                print('Using default configuration for pybdsm.')
            # Set default column names for pybdsm, appending catalogue name to beginning
            self.flux_col = self.unique_col_name('Total_flux')
            self.flux_err_col = self.unique_col_name('E_Total_flux')
            self.peak_col = self.unique_col_name('Peak_flux')
            self.peak_err_col = self.unique_col_name('E_Peak_flux')
            self.rms_val = self.unique_col_name('Isl_rms')
            self.island_col = self.unique_col_name('Isl_id')
            self.flag_col = None
            self.maj_col = self.unique_col_name('DC_Maj')
            self.ra_col = self.unique_col_name('RA')
            self.dec_col = self.unique_col_name('DEC')
            self.ra_fmt = 'deg'
            self.dec_fmt = 'deg'
            self.flux_unit = 'jy'
            self.si_col = None
        else:
            # Default path: Aegean and any unknown/unspecified finder use the
            # column names passed in as keyword arguments.
            if self.finder is not None and self.finder == 'aegean':
                if self.verbose:
                    print('Using default configuration for Aegean.')
            # Append catalogue name to beginning of all columns
            self.flux_col = self.unique_col_name(flux_col)
            self.flux_err_col = self.unique_col_name(flux_err_col)
            self.peak_col = self.unique_col_name(peak_col)
            self.peak_err_col = self.unique_col_name(peak_err_col)
            self.island_col = self.unique_col_name(island_col)
            self.flag_col = self.unique_col_name(flag_col)
            self.maj_col = self.unique_col_name(maj_col)
            if type(rms_val) is str:
                self.rms_val = self.unique_col_name(rms_val)
            else:
                self.rms_val = rms_val
            # Specific fix for GLEAM catalogue.
            # NOTE(review): the suffix has already been baked into the column
            # names above; clearing it here only affects later calls to
            # unique_col_name — confirm this is the intended behaviour.
            if finder is not None and finder.lower() == 'aegean':
                self.col_suffix = ''
            self.flux_unit = flux_unit
            self.ra_col = self.unique_col_name(ra_col)
            self.dec_col = self.unique_col_name(dec_col)
            self.ra_fmt = ra_fmt
            self.dec_fmt = dec_fmt
            self.si_col = None
        self.use_peak = use_peak
        self.flux_unit = self.flux_unit.lower()
        if self.flux_unit not in ('jy', 'mjy', 'ujy'):
            warnings.warn_explicit("Unrecognised flux unit '{0}'. Use 'Jy', 'mJy' or\
            'uJy'. Assuming 'Jy'\n".format(flux_unit), UserWarning, WARN, cf.f_lineno)
            self.flux_unit = 'jy'
        # Keep a running list of key values for all sources, as a dictionary with key 'name'
        self.cat_list = [self.name]
        self.freq = {self.name: self.frequency}
        self.radius = {self.name: self.search_rad}
        self.count = {self.name: len(self.df)}
        self.coords = {}
        self.ra = {}
        self.dec = {}
        self.flux = {}
        self.flux_err = {}
        self.rms = {}
        self.sep = {}
        self.dRAsec = {}
        self.dRA = {}
        self.dDEC = {}
        self.si = {}
        # Initialise all the key fields, including positions and fluxes.
        # set_coords=False defers the expensive SkyCoord construction.
        self.set_key_fields(set_coords=False)
def unique_col_name(self, col):
"""Return a unique column name by appending the catalogue name to the
beginning. If column is None, return None.
Arguments:
----------
col : string
Column name.
Returns:
--------
col : string
Unique column name or None."""
if col is not None:
col = '{0}_{1}{2}'.format(self.name, col, self.col_suffix)
return col
def cat2df(self, filepath, sep, verbose=False):
"""Return a pandas dataframe of the provided catalogue.
If a '.fits' or '.csv' file isn't given, a file delimited by 'sep' will be assumed.
Arguments:
----------
filepath : string
The absolute path to the catalogue.
Keyword arguments:
------------------
sep : string
Delimiter for delimited files.
verbose : bool
Verbose output.
Returns:
--------
df : pandas.DataFrame
A pandas dataframe of the catalogue.
See Also
--------
pandas.DataFrame"""
if verbose:
print("Loading '{0}' catalogue into pandas.".format(filepath.split('/')[-1]))
# Convert from FITS or xml to dataframe
extn = filepath.split('.')[-1].lower()
if extn == 'fits' or extn == 'gz':
table = f.open(filepath)[1]
df = Table(data=table.data).to_pandas()
elif extn == 'xml': # assumed to come from Selavy
table = parse_single_table(filepath).to_table(use_names_over_ids=True)
df = table.to_pandas()
# Otherwise, load straight into pandas as csv or 'sep' delimited file
elif extn == 'csv':
df = pd.read_csv(filepath)
else:
df = pd.read_table(filepath, sep=sep)
if verbose:
print("Catalogue contains {0} sources.".format(len(df)))
return df
    def set_specs(self, img):
        """Set the key specs of this catalogue (area, bounds, peak, rms,
        dynamic range, flux totals, resolved fraction) using an input image.
        This must be done before the catalogue is filtered.

        Arguments:
        ----------
        img : radio_image
            A radio_image object corresponding to this catalogue, which is used
            to calculate various quantities. Use None to derive approximate
            values from the catalogue itself."""
        if img is not None:
            # Get area of non-nan pixels of image
            self.area = img.area
            self.ra_bounds = img.ra_bounds
            self.dec_bounds = img.dec_bounds
            # Get dynamic range between image and rms map and sum of image
            # flux vs. total catalogue flux
            rms_map = f.open(img.rms_map)[0]
            img_data = img.fits.data
            if self.finder == 'aegean':
                # Presumably strips two degenerate leading axes (e.g.
                # stokes/freq) from the image cube — TODO confirm.
                img_data = img_data[0][0]
            self.img_peak = np.max(img_data[~np.isnan(img_data)])
            # Valid rms pixels are those with a positive rms value
            self.rms_bounds = rms_map.data > 0
            self.img_rms = int(np.median(rms_map.data[self.rms_bounds])*1e6)  # uJy
            self.img_peak_bounds = np.max(img_data[self.rms_bounds])
            self.img_peak_pos = np.where(img_data == self.img_peak_bounds)
            self.img_peak_rms = rms_map.data[self.img_peak_pos][0]
            # Dynamic range = brightest valid pixel / rms at that position
            self.dynamic_range = self.img_peak_bounds/self.img_peak_rms
            # Total image flux in Jy, dividing out the beam area in pixels
            # (1.133 = pi/(4 ln 2), the Gaussian beam integral factor)
            self.img_flux = np.sum(img_data[~np.isnan(img_data)]) / \
                (1.133 * ((img.bmaj * img.bmin) / (img.raPS * img.decPS)))
        # Get the approximate area from catalogue
        else:
            if self.name not in list(self.coords.keys()):
                self.set_key_fields()
            self.ra_bounds = (max(self.ra[self.name]), min(self.ra[self.name]))
            # Scale the RA span by cos(dec) to get a true angular width
            dRA = (self.ra_bounds[0] - self.ra_bounds[1])*np.cos(
                np.deg2rad(np.mean(self.dec[self.name])))
            self.dec_bounds = (max(self.dec[self.name]), min(self.dec[self.name]))
            dDEC = abs(self.dec_bounds[0] - self.dec_bounds[1])
            self.area = dRA*dDEC
            # Approximate image stats from the catalogue fluxes/rms
            self.img_peak = np.max(self.flux[self.name])
            self.img_rms = int(np.median(self.rms[self.name])*1e6)  # uJy
            self.dynamic_range = self.img_peak/self.img_rms
            self.img_flux = np.nan
        # Number of islands fitted with more than one component
        self.blends = len(np.where(self.df[self.island_col].value_counts() > 1)[0])
        self.cat_flux = np.sum(self.flux[self.name])
        # Get median spectral index if column exists (-99 = sentinel for none)
        if self.name in list(self.si.keys()):
            self.med_si = np.median(self.si[self.name])
        else:
            self.med_si = -99
        if self.verbose:
            print("Sum of flux in image is {0:.3f} Jy and sum of all fitted\
            gaussians is {1:.3f} Jy.".format(self.img_flux, self.cat_flux))
            print("Image peak is {0:.2f} Jy.".format(self.img_peak))
            print("Dynamic range is {0:.0E}.".format(self.dynamic_range))
            print("Number of multi-component islands is {0}.".format(self.blends))
            if self.med_si != -99:
                print("Median spectral index is {0:.2f}.".format(self.med_si))
        # Store the initial length of the catalogue
        self.initial_count = self.count[self.name]
        # Derive the fraction of resolved sources as the fraction of sources
        # with int flux > 3-sigma away from peak flux
        if self.name in list(self.flux.keys()):
            # Josh's method: combined uncertainty of int and peak fluxes
            self.uncertainty = np.sqrt(self.df[self.flux_err_col]**2 + self.df[
                self.peak_err_col]**2)
            if self.finder == 'selavy':
                # Selavy errors are presumably underestimated without the
                # local rms folded in — TODO confirm.
                self.uncertainty += np.sqrt(self.df[self.rms_val]**2)
            self.sigma = (self.df[self.flux_col] - self.df[self.peak_col]) / self.uncertainty
            self.resolved = np.where(self.sigma > 3)[0]
            self.resolved_frac = len(self.resolved) / len(self.df)
        else:
            self.resolved_frac = -1
    def set_key_fields(self, indices=None, cat=None, set_coords=True):
        """Set the key fields, including the positions, frequency and fluxes.
        This must be run each time the dataframe is updated. Each field is a
        dictionary with key catalogue.name.

        Keyword Arguments
        -----------------
        indices : list
            A list of indices from this instance to subselect after removing rows.
            If indices is None, all indices of this instance will be used.
        cat : catalogue
            Another catalogue object used to initialise the fields. If None is
            provided, this instance will be used.
        set_coords : bool
            Create a list of SkyCoord objects for every source. As this is a
            time-consuming task, it is only recommended after a cutout has been applied.
            This only applies when cat is None.

        See Also
        --------
        astropy.coordinates.SkyCoord
        pandas.Series"""
        coords_set = False
        # After cross-match, add new coordinates, positions and fluxes, etc to dictionaries
        if cat is not None:
            # Set easy names for columns (prefixed 'catname_selfname_')
            prefix = '{0}_{1}_'.format(cat.name, self.name)
            sep = prefix + 'sep'
            dRAsec = prefix + 'dRAsec'
            dRA = prefix + 'dRA'
            dDEC = prefix + 'dDEC'
            self.cat_list.append(cat.name)
            # Count only the rows that actually matched (non-NaN separation)
            self.count[cat.name] = len(np.where(~np.isnan(cat.df[sep]))[0])
            self.freq[cat.name] = cat.freq[cat.name]
            self.radius[cat.name] = cat.radius[cat.name]
            self.ra[cat.name] = cat.ra[cat.name]
            self.dec[cat.name] = cat.dec[cat.name]
            # Compute the positional offsets
            self.df[dRAsec] = (self.ra[self.name] -
                               self.ra[cat.name])*3600  # in seconds
            # Convert the raw RA offset to a true angular offset at the
            # mean declination of the pair
            self.df[dRA] = self.df[dRAsec]*np.cos(np.deg2rad((self.dec[self.name] +
                                                              self.dec[cat.name])/2))
            self.df[dDEC] = (self.dec[self.name] - self.dec[cat.name])*3600
            # Store in dictionaries
            self.sep[cat.name] = cat.df[sep]
            self.dRAsec[cat.name] = self.df[dRAsec]
            self.dRA[cat.name] = self.df[dRA]
            self.dDEC[cat.name] = self.df[dDEC]
            if cat.name in list(cat.flux.keys()):
                self.flux[cat.name] = cat.flux[cat.name]
                self.flux_err[cat.name] = cat.flux_err[cat.name]
                self.rms[cat.name] = cat.rms[cat.name]
                # Write flux ratio if frequencies within 1%
                freq_ratio = np.abs(cat.freq[cat.name]/self.freq[self.name]-1)
                if self.name in list(self.flux.keys()) and freq_ratio < 0.01:
                    self.df[prefix + 'flux_ratio'] = self.flux[self.name]/self.flux[cat.name]
            if cat.si_col is not None:
                self.si[cat.name] = cat.si[cat.name]
        # Otherwise initialise or update dictionary for this instance
        else:
            if set_coords or (indices is not None and len(self.coords) == 0):
                # Initialise SkyCoord object for all sky positions and create numpy
                # array for RA and DEC in degrees
                self.coords[self.name] = SkyCoord(ra=self.df[self.ra_col],
                                                  dec=self.df[self.dec_col],
                                                  unit='{0},{1}'.format(self.ra_fmt,
                                                                        self.dec_fmt))
                self.ra[self.name] = self.coords[self.name].ra.deg
                self.dec[self.name] = self.coords[self.name].dec.deg
                coords_set = True
            # Initialise fluxes (first-time setup: no subselection requested
            # and at least one flux column is configured)
            if indices is None and not (self.flux_col is None and self.peak_col is None):
                # Create pandas series for peak or integrated flux and errors
                if self.use_peak and self.peak_col is None:
                    warnings.warn_explicit("Can't use peak flux since peak column name not\
                    specified. Using integrated flux.\n", UserWarning, WARN, cf.f_lineno)
                if self.use_peak and self.peak_col is not None:
                    self.flux[self.name] = self.df[self.peak_col].copy()
                    if self.peak_err_col is not None:
                        self.flux_err[self.name] = self.df[self.peak_err_col].copy()
                    else:
                        self.flux_err[self.name] = self.flux[self.name]*0.1  # 10% error
                else:
                    self.flux[self.name] = self.df[self.flux_col].copy()
                    if self.flux_err_col is not None:
                        self.flux_err[self.name] = self.df[self.flux_err_col].copy()
                    else:
                        self.flux_err[self.name] = self.flux[self.name]*0.1  # 10% error
                # Set 10% errors where error is <=0
                if np.any(self.flux_err[self.name] <= 0):
                    i = np.where(self.flux_err[self.name] <= 0)[0]
                    self.flux_err[self.name][i] = self.flux[self.name][i]*0.1  # 10% error
                # Set rms as pandas series or single value
                if type(self.rms_val) is str:
                    self.rms[self.name] = self.df[self.rms_val].copy()
                else:
                    self.rms[self.name] = self.rms_val
                # Force Jy units
                factor = 1
                if self.flux_unit == 'mjy':
                    factor = 1e-3
                elif self.flux_unit == 'ujy':
                    factor = 1e-6
                self.flux[self.name] *= factor
                self.flux_err[self.name] *= factor
                self.rms[self.name] *= factor
                if self.si_col is not None:
                    self.si[self.name] = self.df[self.si_col]
            # Otherwise just update this instance (subselect previously
            # initialised fields down to 'indices')
            else:
                if not coords_set:
                    self.coords[self.name] = self.coords[self.name][indices]
                    self.ra[self.name] = self.ra[self.name][indices]
                    self.dec[self.name] = self.dec[self.name][indices]
                # Reset indices of pandas series
                if self.name in self.flux:
                    self.flux[self.name] = self.flux[self.name][indices].reset_index(drop=True)
                    self.flux_err[self.name] = self.flux_err[self.name][indices].reset_index(
                        drop=True)
                    if type(self.rms_val) is str:
                        self.rms[self.name] = self.rms[self.name][indices].reset_index(drop=True)
                # Only update these for cross-matched catalogues
                if self.name in self.sep:
                    self.sep[self.name] = self.sep[self.name][indices].reset_index(drop=True)
                    self.dRAsec[self.name] = self.dRAsec[self.name][indices].reset_index(drop=True)
                    self.dRA[self.name] = self.dRA[self.name][indices].reset_index(drop=True)
                    self.dDEC[self.name] = self.dDEC[self.name][indices].reset_index(drop=True)
                if self.name in self.si:
                    self.si[self.name] = self.si[self.name][indices].reset_index(drop=True)
        # Reset indices and catalogue length after change has been made
        self.df = self.df.reset_index(drop=True)
        self.count[self.name] = len(self.df)
def overwrite_df(self, catalogue, step='this', set_coords=True, verbose=True):
"""Overwrite self.df with another file or DataFrame. All other fields are assumed to stay the same.
Arguments:
----------
catalogue : string or pandas.DataFrame
The filename of the catalogue (must be csv file), or a pandas data frame object.
If a dataframe is input, it is assumed to have come from a catalogue object.
Keyword arguments:
------------------
verbose : bool
Verbose output.
See Also:
---------
pandas.DataFrame"""
# Read from file if filename is provided
if type(catalogue) is str:
self.df = pd.read_csv(catalogue)
if verbose:
print("'{0}' already exists. Skipping {1} step and setting catalogue to this\
file.".format(catalogue, step))
print("{0} catalogue now contains {1} sources.".format(self.name, len(self.df)))
else:
self.df = catalogue
# Reset the key fields
self.set_key_fields(set_coords=set_coords)
def write_df(self, write, filename, verbose=True):
"""Write data frame to file.
Arguments:
----------
write : bool
Write the file.
filename : string
The file name.
Keyword Arguments:
------------------
verbose : bool
Verbose output."""
if write:
if verbose:
print("Writing to '{0}'.".format(filename))
self.df.to_csv(filename, index=False)
    def cutout_box(self, ra, dec, fov=10, redo=False, write=True, verbose=True):
        """Cut out a box of the catalogue, updating the catalogue to only
        contain sources within this box.

        Input a central RA and DEC and FOV, or a tuple of two boundaries for each.

        Arguments:
        ----------
        ra : float or tuple
            The RA centre in degrees, or a tuple of two RA boundaries.
        dec : float or tuple
            The DEC centre in degrees, or a tuple of two DEC boundaries.

        Keyword arguments:
        ------------------
        fov : float
            The field of view in degrees (i.e. fov*fov degrees). Only used when
            RA & DEC are single values.
        redo : bool
            Cut out a box, even if a cutout file exists.
        write : bool
            Write the cutout catalogue to file.
        verbose : bool
            Verbose output."""
        filename = self.cutout_name
        # If column units aren't degrees, set sky coords and get RA/DEC in degrees
        if self.ra_fmt != 'deg' or self.dec_fmt != 'deg':
            if len(self.coords) == 0:
                self.set_key_fields(set_coords=True)
            RA = self.ra[self.name]
            DEC = self.dec[self.name]
        # Otherwise just use column values
        else:
            RA = self.df[self.ra_col]
            DEC = self.df[self.dec_col]
        # RA and DEC must agree in type: both scalars or both 2-tuples
        if (type(ra) is tuple and (type(dec) is not tuple or len(ra) != 2)) or\
           (type(dec) is tuple and (type(ra) is not tuple or len(dec) != 2)):
            warnings.warn_explicit('RA and DEC must both be single value or a\
            tuple with two indices. Cutout not applied.\n', UserWarning,
                                   WARN, cf.f_lineno)
        elif redo or not os.path.exists(filename):
            if verbose:
                if redo:
                    print("Re-doing cutout step.")
                    if write:
                        print("Overwriting '{0}'.".format(filename))
                print("Cutting out sources from {0}.".format(self.name))
            # Cut out all rows outside RA and DEC boundaries
            if type(ra) is tuple:
                # axis_lim picks the appropriate boundary from the tuple
                ra_min, ra_max = axis_lim(ra, min), axis_lim(ra, max)
                dec_min, dec_max = axis_lim(dec, min), axis_lim(dec, max)
                if verbose:
                    print("Selecting sources with {0} <= RA <= {1} and {2} <= DEC <=\
                    {3}.".format(ra_min, ra_max, dec_min, dec_max))
                self.df = self.df[(RA <= ra_max) & (RA >= ra_min) & (DEC <= dec_max)
                                  & (DEC >= dec_min)]
            # Cut out all rows outside the FOV
            else:
                if verbose:
                    print("Using a {0}x{0} degree box centred at {1} deg, {2} deg.".format(fov,
                                                                                           ra, dec))
                # The RA half-width is widened by 1/cos(dec) so the box
                # covers a constant angular size on the sky
                self.df = self.df[(DEC <= dec + fov/2) & (DEC >= dec - fov/2) &
                                  (RA <= ra + fov/2/np.cos(np.deg2rad(DEC))) &
                                  (RA >= ra - fov/2/np.cos(np.deg2rad(DEC)))]
            if verbose:
                print('{0} {1} sources within this region.'.format(len(self.df), self.name))
            # Drop the rejected rows, reset the key fields and write to file
            self.set_key_fields(indices=self.df.index.tolist())
            self.write_df(write, filename)
        # If file exists, simply read in catalogue
        else:
            self.overwrite_df(filename, step='cutout', set_coords=False)
def filter_sources(self, flux_lim=0, SNR=0, ratio_frac=0, ratio_sigma=0,
reject_blends=False, psf_tol=0, resid_tol=0, flags=False,
file_suffix='', redo=False, write=True, verbose=False):
"""Reject problematic sources according to several criteria. This will
overwrite self.df.
Keyword arguments:
------------------
flux_lim : float
The flux density limit in Jy, below which sources will be rejected.
SNR : float
The S/N ratio limit (where N is the rms), below which sources will be
rejected. Use 0 to skip this step.
ratio_frac : float
The fraction given by the integrated flux divided by the peak flux, above
which, sources will be rejected. Use 0 to skip this step.
ratio_sigma : float
Reject sources based on the validation metric for resolved sources,
based on their int/peak, above this sigma value.
reject_blends : bool
For Aegean and Selavy only. Reject multi-component sources.
psf_tol : float
For Aegean and Selavy only. Reject sources with fitted major axis this many
times the psf major axis. Use 0 to skip this step.
resid_tol : float
For Aegean only. Reject sources with a std in the residual this many times
the rms. Use 0 to skip this step.
flags : bool
Reject sources with flags != 0. Only performed if flag_col set in constructor
of this instance.
file_suffix : string
Suffix to add to filename, for when doing several steps of filtering.
redo : bool
Perform the filtering, even if filtered file exists.
write : bool
Write the filtered catalogue to file.
verbose : bool
Verbose output."""
if file_suffix != '':
self.basename += file_suffix
filename = '{0}.csv'.format(self.basename)
else:
filename = self.filtered_name
if redo or not os.path.exists(filename):
if verbose:
if redo:
print("Re-doing filtering.")
if write:
print("Overwriting '{0}'.".format(filename))
print("Filtering sources in '{0}'...".format(filename))
print("Initial number of sources: {0}.".format(len(self.df)))
# Reject faint sources
if flux_lim != 0:
if self.flux_col is not None:
self.df = self.df[self.flux[self.name] > flux_lim]
if verbose:
print("Rejected (faint) sources below {0} Jy.".format(flux_lim))
print("Number remaining: {0}.".format(len(self.df)))
else:
warnings.warn_explicit("No int flux column given. Can't reject\
resolved sources based on flux.\n", UserWarning, WARN,
cf.f_lineno)
# Reject low S/N sources
if SNR != 0:
if self.flux_col is not None and self.rms_val is not None:
# reindex key fields before comparison
self.set_key_fields(indices=self.df.index.tolist())
self.df = self.df[self.flux[self.name] > SNR*self.rms[self.name]]
if verbose:
print("Rejected (low-S/N) sources < {0} x r.m.s.".format(SNR))
print("Number remaining: {0}.".format(len(self.df)))
else:
warnings.warn_explicit("rms or int flux column not given. Can't reject\
resolved sources based on SNR.\n", UserWarning, WARN, cf.f_lineno)
# Reject resolved sources based on flux ratio
if ratio_frac != 0:
if self.peak_col is not None:
self.df = self.df[self.df[self.flux_col]/self.df[self.peak_col] <= ratio_frac]
if verbose:
print("Rejected (resolved) sources with total flux > {0}\
times the peak flux.".format(ratio_frac))
print("Number remaining: {0}.".format(len(self.df)))
else:
warnings.warn_explicit("No peak flux column given. Can't reject resolved\
sources using flux ratio.\n", UserWarning, WARN, cf.f_lineno)
# Reject resolved sources based on flux ratio metric
if ratio_sigma != 0:
if self.peak_col is not None:
uncertainty = np.sqrt(self.df[self.flux_err_col]**2 + self.df[
self.peak_err_col]**2)
if self.finder == 'selavy':
uncertainty += np.sqrt(self.df[self.rms_val]**2)
diff = self.df[self.flux_col] - self.df[self.peak_col]
resolved = diff > ratio_sigma * uncertainty
self.df = self.df[~resolved]
if verbose:
print("Rejected (resolved) sources according to int/peak metric,\
above {0} sigma.".format(ratio_sigma))
print("Number remaining: {0}.".format(len(self.df)))
else:
warnings.warn_explicit("No peak flux column given. Can't reject resolved\
sources using flux ratio.\n", UserWarning, WARN, cf.f_lineno)
# Reject multi-component islands
if reject_blends:
if self.knownFinder:
island_counts = self.df[self.island_col].value_counts()
point_islands = island_counts[island_counts == 1].index
self.df = self.df[self.df[self.island_col].isin(point_islands)]
if verbose:
print("Rejected (resolved) sources belonging to a multi-component island.")
print("Number remaining: {0}.".format(len(self.df)))
else:
warnings.warn_explicit("Can't reject blends since finder isn't Aegean or\
Selavy or pyBDSM.\n", UserWarning, WARN, cf.f_lineno)
# Reject extended components based on component size
if psf_tol != 0:
if self.knownFinder:
if self.image is not None:
self.df = self.df[self.df[self.maj_col] <= psf_tol*self.image.bmaj]
elif self.finder == 'Aegean':
self.df = self.df[self.df['{0}_a'.format(self.name)]
<= psf_tol*self.df['{0}_psf_a'.format(self.name)]]
else:
warnings.warn_explicit("Can't rejected resolved sources based on psf tolerance without\
inputting radio_image object to read psf.\n", UserWarning, WARN,
cf.f_lineno)
if verbose:
print("Rejected (resolved) sources with fitted major axis\
> {0} times the psf major axis.".format(psf_tol))
print("Number remaining: {0}.".format(len(self.df)))
else:
warnings.warn_explicit("Can't reject sources based on PSF since finder\
isn't Aegean or Selavy or pyBDSM.\n", UserWarning,
WARN, cf.f_lineno)
# Reject sources with poor fit
if resid_tol != 0:
if self.finder == 'aegean':
# Reindex key fields before comparison
self.set_key_fields(indices=self.df.index.tolist())
self.df = self.df[self.df['{0}_residual_std'.format(self.name)]
<= resid_tol*self.rms[self.name]]
if verbose:
print("Rejected (poorly fit) sources with standard deviation in "
"residual > {0} times the rms.".format(resid_tol))
print("Number remaining: {0}.".format(len(self.df)))
else:
warnings.warn_explicit("Can't reject resolved sources based on residual "
"since finder isn't Aegean.\n", UserWarning,
WARN, cf.f_lineno)
# Reject sources with any flags
if flags:
if self.flag_col is not None:
self.df = self.df[self.df[self.flag_col] == 0]
if verbose:
print("Rejecting (problematic) sources flagged as bad.")
print("Number remaining: {0}.".format(len(self.df)))
else:
warnings.warn_explicit("Can't reject resolved sources based on flag since flag "
"column not set.\n", UserWarning, WARN, cf.f_lineno)
# Drop the rejected rows, reset the key fields and write to file
self.set_key_fields(indices=self.df.index.tolist(), set_coords=False)
self.write_df(write, filename)
# If file exists, simply read in catalogue
else:
self.overwrite_df(filename, step='filtering', verbose=verbose)
    def cross_match(self, cat, radius='largest', join_type='1', redo=False, write=True):
        """Perform a nearest neighbour cross-match between this catalogue object
        and another catalogue object. This will update this object's catalogue
        to the matched catalogue and add the key fields to the key field dictionaries.

        Arguments:
        ----------
        cat : catalogue
            A catalogue object to cross-match this instance to. NOTE: `cat` is
            modified in place — its `df` is replaced by the rows matched to
            this instance.

        Keyword arguments:
        ------------------
        radius : string or float
            The search radius in arcsec. Use 'largest' to use the larger of the two default
            radii.
        join_type : string
            The join type of the two catalogues. '1' to keep all rows from this instance,
            or '1and2' to keep only matched rows.
        redo : bool
            Perform the cross-matching, even if cross-matched file exists.
        write : bool
            Write the cross-matched catalogue to file."""
        # Refuse to cross-match the same catalogue twice.
        if cat.name in self.cat_list:
            warnings.warn_explicit("You've already cross-matched to {0}. "
                                   "Catalogue unchanged.\n".format(cat.name),
                                   UserWarning, WARN, cf.f_lineno)
            return
        filename = '{0}_{1}.csv'.format(self.basename, cat.name)
        # Cross-match and create file if it doesn't exist, otherwise open existing file
        if redo or not os.path.exists(filename):
            # Nothing to match when either catalogue is empty.
            if len(self.df) == 0 or len(cat.df) == 0:
                if self.verbose:
                    print('No {0} sources to match. Catalogue unchanged.'.format(cat.name))
                return
            print("---------------------------------")
            print("| Cross-matching {0} and {1} |".format(self.name, cat.name))
            print("---------------------------------")
            if redo:
                print("Re-doing cross-match.")
                if write:
                    print("Overwriting '{0}'.".format(filename))
            print("Cross-matching {0} {1} sources with {2} {3} sources.".format(len(self.df),
                                                                                self.name,
                                                                                len(cat.df),
                                                                                cat.name))
            # Force coordinates to be set
            if len(self.coords) == 0:
                self.set_key_fields()
            if len(cat.coords) == 0:
                cat.set_key_fields()
            # Get sky coordinates and find nearest match of every source
            # from this instance, independent of the search radius
            c1 = self.coords[self.name]
            c2 = cat.coords[cat.name]
            # astropy nearest-neighbour match: one candidate per row of c1.
            indices, sep, sep3d = c1.match_to_catalog_sky(c2)
            # Take the maximum radius from the two
            if radius == 'largest':
                radius = max(self.search_rad, cat.search_rad)
                if self.verbose:
                    print('Using the largest of the two search radii of {0} arcsec.'.format(radius))
            # Only take matched rows from cat, so self.df and cat.df are parallel
            cat.df = cat.df.iloc[indices].reset_index(drop=True)
            # Create pandas dataframe of the separations in arcsec
            sep_col = '{0}_{1}_sep'.format(cat.name, self.name)
            sepdf = pd.DataFrame(data={sep_col: sep.arcsec})
            # Only take matches within search radius
            indices = np.where(sep.arcsec < radius)[0]
            # Only add cross-matched table when at least 1 match
            if len(indices) >= 1:
                print("Found {0} matches within {1} arcsec.".format(len(indices), radius))
                # Don't reset indices so cat.df stays parallel with self.df
                cat.df = cat.df.iloc[indices]
                sepdf = sepdf.iloc[indices]
                # Concatenate tables together according to match type
                if join_type == '1and2':
                    # Inner join: keep only rows with a match inside the radius.
                    matched_df = pd.concat([self.df, cat.df, sepdf], axis=1, join='inner')
                elif join_type == '1':
                    # Keep every row from this instance; reindex leaves NaNs
                    # where no match fell within the search radius.
                    matched_df = pd.concat([self.df, cat.df, sepdf],
                                           axis=1).reindex(self.df.index)
                # NOTE(review): any other join_type value leaves matched_df
                # unbound and the next line raises NameError — consider
                # validating join_type up front.
                # Reset indices and overwrite data frame with matched one
                matched_df = matched_df.reset_index(drop=True)
                # Set catalogue to matched df and write to file
                self.overwrite_df(matched_df)
                self.write_df(write, filename)
            else:
                print('{0} cross-matches between {1} and {2}. Catalogue unchanged.'.format(
                    len(indices), self.name, cat.name))
                return
        # If file exists, simply read in catalogue
        # (presumably done in the code following this branch — continues
        # beyond this view)
        else:
            print("'{0}' already exists. Skipping cross-matching step.".format(filename))
            print('Setting catalogue to this file.')