The following python program should do what you want, or something very close to it.
In the desired_output.txt
the 3rd line seems to be erroneous:
Mem_id#-aa3 : time- file1.txt value = ccx3 / file2.txt value= dd3
the dd3 should probably be
cc3
Apart from that the output from the program matches except for whitespace, which seems a bit irregular in your sample output.
The input is considered to be ordered by key (memid)
- The program buffers by default 4 rows (max_diff + 1) while trying to sync up. If none of the keys in that buffer matches the "current" key, and vice versa, both rows are considered non-matching, both are printed, and the next pair is tried. If a key is found, the non-matching items in the other buffer are output first.
The sample input is a bit restricted on what behaviour is expected when the first and the second line have the same memid twice (or more).
In output()
I try to match
any rows and pop all matching ones (both from left and right). Therefore the order of matching lines within the same memid is not important. If left or right or both are empty afterwards, printing is easy (especially when both are empty). For the rest, I match each remaining line from the left against the remaining lines on the right.
The fmt
string in line_out()
determines the output, you can freely change/reorder that.
#! /usr/bin/env python
# coding: utf-8
# http://unix.stackexchange.com/q/161913/33055
from __future__ import print_function
from collections import OrderedDict
from logging import debug
import sys
class RowBuffer:
    """Look-ahead buffer over a delimited text file that is sorted by key.

    The first line of the file is treated as the header.  Every following
    line is split into fields; the first field is the key and the remaining
    fields are the values of that row.  Rows that share a key are grouped
    together.  At most ``max_diff`` distinct keys are held in the buffer at
    any time, which is how far ``compare`` can look ahead to re-synchronise
    two files whose keys do not line up.
    """

    def __init__(self, file_name, delim=None, max_diff=3):
        """Open *file_name*, read its header and pre-fill the buffer.

        delim is the character that is used for splitting input.
        None->whitespace
        max_diff is the maximum number of distinct keys kept buffered.
        """
        self._verbose = 0
        self._file_name = file_name
        self._fp = open(self._file_name)
        self._delim = delim
        self._max_diff = max_diff
        self._head = self._split(self._fp.readline())
        # the buffer consists of a maximum of max_diff entries
        # the keys are the first items of a row, the value a list
        # of all other items on that row
        self._buffer = OrderedDict()
        self.fill_buffer()

    def _split(self, line):
        """Split one raw input line into its fields.

        The line terminator is removed first so that, with an explicit
        delimiter, it does not end up glued to the last field.  With
        ``delim=None`` this is equivalent to a plain ``line.split()``.
        """
        return line.rstrip('\r\n').split(self._delim)

    def compare(self, rb):
        """Print the differences between this (left) file and *rb* (right).

        Both buffers are consumed key by key.  When the two front keys are
        equal their rows are compared.  Otherwise the look-ahead buffers are
        searched: if the left key shows up later on the right, the right
        keys in front of it are reported as right-only (and the mirror case
        for the left).  If neither key is found on the other side, both are
        reported as one-sided.
        """
        if self._head != rb._head:
            print('headings differ:\n {}\n {}'.format(
                self._head, rb._head))
        while self._buffer:
            l = self.get()
            try:
                r = rb.get()
            except KeyError:
                # right file exhausted: keep draining the left file instead
                # of stopping after the first left-only key
                debug('only left %s', l[0])
                self.output(l, None, rb)
                continue
            if l[0] == r[0]:
                debug('compare vals %s', l[0])
                self.output(l, r, rb)
                continue
            if l[0] in rb:
                # left key in right, but not at top
                # output right until top keys are same
                while l[0] != r[0]:
                    debug('only right %s', r[0])
                    self.output(None, r, rb)
                    r = rb.get()
                self.output(l, r, rb)
                continue
            if r[0] in self:
                # right key in left, but not at top
                # output left until top keys are same
                while l[0] != r[0]:
                    debug('only left %s', l[0])
                    self.output(l, None, rb)
                    l = self.get()
                self.output(l, r, rb)
                continue
            # neither found: output both
            debug('neither left in right nor vv %s %s', l[0], r[0])
            self.output(l, None, rb)
            self.output(None, r, rb)
        while rb._buffer:  # remaining in right file
            r = rb.get()
            debug('only right %s', r[0])
            self.output(None, r, rb)

    def output(self, l, r, right):
        """Report the differences for one key.

        *l* and *r* are ``(key, rows)`` items as returned by ``get`` for the
        left and right file; either may be None when the key exists on one
        side only.  *right* is the right-hand RowBuffer (used for its file
        name).  The row lists of *l* and *r* are modified in place:
        identical rows are paired off and removed before reporting.
        """
        d = dict(
            col0_header=self._head[0],
            left_file_name=self._file_name,
            right_file_name=right._file_name,
        )
        if l is not None and r is not None:
            # one or more values on both sides, compare all lines on the
            # left with all on the right remove any matching pairs
            matched_left = []
            matched_right = set()
            for lidx, lv in enumerate(l[1]):
                for ridx, rv in enumerate(r[1]):
                    # a right row can cancel out only one left row
                    if ridx not in matched_right and lv == rv:
                        matched_left.append(lidx)
                        matched_right.add(ridx)
                        break
            # pop from back of list, not invalidate index
            for lidx in sorted(matched_left, reverse=True):
                l[1].pop(lidx)
            for ridx in sorted(matched_right, reverse=True):
                r[1].pop(ridx)
        if r is None or not r[1]:
            # nothing (left) on the right: everything remaining is left-only
            for lv in l[1]:
                for idx, k in enumerate(self._head[1:]):
                    self.line_out(d, col0_value=l[0], col_header=k,
                                  left_value=lv[idx], right_value=' ')
            return
        if l is None or not l[1]:
            # nothing (left) on the left: take the key from the right item,
            # since l itself may be None here
            for rv in r[1]:
                for idx, k in enumerate(self._head[1:]):
                    self.line_out(d, col0_value=r[0], col_header=k,
                                  left_value=' ', right_value=rv[idx])
            return
        # print non matching
        for lv in l[1]:
            for rv in r[1]:
                for idx, k in enumerate(self._head[1:]):
                    if lv[idx] == rv[idx]:
                        continue  # same value
                    self.line_out(d, col0_value=l[0], col_header=k,
                                  left_value=lv[idx], right_value=rv[idx])

    def line_out(self, d, **kw):
        """Format and print one difference line.

        *d* holds the fields shared by all lines of a key, *kw* the fields
        specific to this line.
        """
        # manipulate and print output
        # the fields of the format string can be arbitrarily arranged
        # as long as the field names (between {} match)
        fmt = '{col0_header}#-{col0_value} : {col_header}- ' \
              '{left_file_name} value = {left_value} / ' \
              '{right_file_name} value= {right_value}'
        d1 = d.copy()
        d1.update(kw)
        s = fmt.format(**d1)
        # s = s.rstrip()
        s = s[0].upper() + s[1:]  # sample output doesn't match input
        print(s)

    def get(self):
        """Remove and return the oldest ``(key, rows)`` item, then refill.

        Raises KeyError when the buffer is empty.
        """
        item = self._buffer.popitem(last=False)
        self.fill_buffer()
        return item

    def fill_buffer(self):
        """Read rows until ``max_diff`` distinct keys are buffered or EOF.

        The file is closed as soon as its end is reached.
        """
        if self._fp is None:
            return
        while len(self._buffer) < self._max_diff:
            line = self._fp.readline()
            if not line:
                # real end of file; testing the raw line (not the split
                # row) keeps this correct for any delimiter
                self._fp.close()
                self._fp = None
                return
            if not line.strip():
                continue  # blank line: carries no key, skip it
            row = self._split(line)
            entry = self._buffer.setdefault(row[0], [])
            entry.append(row[1:])

    def __contains__(self, key):
        """Return True if *key* is among the currently buffered keys."""
        self.fill_buffer()
        return key in self._buffer
# Script entry: the first command-line argument names the left file, the
# second the right file; the differences are printed to standard output.
rb1, rb2 = RowBuffer(sys.argv[1]), RowBuffer(sys.argv[2])
rb1.compare(rb2)
If the files are sorted (the samples you posted are) then it's as simple as
join -t : File1.txt File2.txt
join
pairs up lines from two files where the join field is equal. By default, the join field is the first field, the fields are output in order except that the join field is not repeated, and non-pairable lines are skipped, which is exactly what you want.
Note that if the files have Windows line endings, they appear under Unix systems to have an extra carriage return character at the end of each line. The CR is mostly invisible, but as far as join
and other text tools are concerned, it's a character like any other, and it means the fields of File1.txt
all end with a CR whereas the ones in File2.txt
don't so they don't match. You need to strip the CR, at least in File1.txt
.
<File1.txt tr -d '\r' | join -t : - File2.txt
The files do need to be sorted. If they aren't, then in ksh/bash/zsh you can use process substitution. (Add tr -d '\r' |
if needed.)
join -t : <(sort File1.txt) <(sort File2.txt)
In plain sh, if your Unix variant has /dev/fd
(most do), you can use that instead to pipe the output of two programs through two file descriptors.
sort File2.txt | { sort File1.txt | join -t : /dev/fd/0 /dev/fd/3; } 3<&1
If you need to preserve the original order of File1.txt
and it isn't sorted by the join field, then add line numbers to remember the original order, sort by the join field, join, sort by line numbers and strip the line numbers. (You can do something similar if you want to preserve the order of the other file.)
<File1.txt nl -s : |
sort -t : -k 2 |
join -t : -1 2 - <(sort File2.txt) |
sort -t : -k 2,2n |
cut -d : -f 1,3
Best Answer
Using
awk
:Output: